Total coverage: 146860 (8%)of 1859164
12 61 61 9 62 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct blk_mq_ctxs { struct kobject kobj; struct blk_mq_ctx __percpu *queue_ctx; }; /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; struct list_head rq_lists[HCTX_MAX_TYPES]; } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; #define BLK_MQ_CPU_WORK_BATCH (8) typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, unsigned int); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); /* * CPU -> queue mappings */ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); /* * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue * @q: request queue * @type: the hctx type index * @cpu: CPU */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, enum hctx_type type, unsigned int cpu) { return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { /* Fast path: hardware queue is not stopped most of the time. */ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return false; /* * This barrier is used to order adding of dispatch list before and * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier * in blk_mq_start_stopped_hw_queue() so that dispatch code could * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not * empty to avoid missing dispatching requests. */ smp_mb(); return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; } unsigned int blk_mq_in_flight(struct request_queue *q, struct block_device *part); void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) { if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token); } static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); return 0; } static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) { if (token < 0) return; if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token); } static inline int blk_mq_get_rq_budget_token(struct request *rq) { if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq); return -1; } static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_add(val, &hctx->nr_active); } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_add_active_requests(hctx, val); } static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_inc_active_requests(hctx); } static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_sub_active_requests(hctx, val); } static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_dec_active_requests(hctx); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ static inline bool blk_mq_can_poll(struct request_queue *q) { return (q->limits.features & BLK_FEAT_POLL) && q->tag_set->map[HCTX_TYPE_POLL].nr_queues; } #endif
3 3 12 12 9 2 12 1 8 6 3 3 1 4 2 2 4 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Range as input support for IPv4 added */ /* 2 nomatch flag support added */ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ /* 6 skbinfo support added */ #define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net"); /* Type specific function prefix */ #define HTYPE hash_net #define IP_SET_HASH_WITH_NETS /* IPv4 variant */ /* Member elements */ struct hash_net4_elem { __be32 ip; u16 padding0; u8 nomatch; u8 cidr; }; /* Common functions */ static bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->cidr == ip2->cidr; } static int hash_net4_do_data_match(const struct hash_net4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static void hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); elem->cidr = cidr; } static bool hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_net4_data_next(struct hash_net4_elem *next, const struct hash_net4_elem *d) { next->ip = d->ip; } #define MTYPE hash_net4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) e.cidr = HOST_MASK; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip_to < ip) swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } if (retried) ip = ntohl(h->next.ip); do { i++; e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_net4_data_next(&h->next, &e); return -ERANGE; } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip++ < ip_to); return ret; } /* IPv6 variant */ struct hash_net6_elem { union nf_inet_addr ip; u16 padding0; u8 nomatch; u8 cidr; }; /* Common functions */ static bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->cidr == ip2->cidr; } static int hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static void hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); elem->cidr = cidr; } static bool hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_net6_data_next(struct hash_net6_elem *next, const struct hash_net6_elem *d) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_net6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_net6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (e.cidr == 0) return -EINVAL; if (adt == IPSET_TEST) e.cidr = HOST_MASK; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6_netmask(&e.ip, e.cidr); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net6_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!e.cidr || e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ip6_netmask(&e.ip, e.cidr); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } static struct ip_set_type hash_net_type __read_mostly = { .name = "hash:net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_net_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_net_init(void) { return ip_set_type_register(&hash_net_type); } static void __exit hash_net_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_net_type); } module_init(hash_net_init); module_exit(hash_net_fini);
9 7 3 1 3 7 1 6 1 6 6 3 3 1 2 2 1 1 12 12 19 16 5 1 10 6 8 9 9 15 1 14 1 13 14 1 13 9 9 9 1 8 8 7 7 8 8 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 // SPDX-License-Identifier: GPL-2.0-or-later /* net/sched/sch_ingress.c - Ingress and clsact qdisc * * Authors: Jamal Hadi Salim 1999 */ #include <linux/module.h> #include <linux/types.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tcx.h> struct ingress_sched_data { struct tcf_block *block; struct tcf_block_ext_info block_info; struct mini_Qdisc_pair miniqp; }; static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long ingress_find(struct Qdisc *sch, u32 classid) { return TC_H_MIN(classid) + 1; } static unsigned long ingress_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { return ingress_find(sch, classid); } static void ingress_unbind_filter(struct Qdisc *sch, unsigned long cl) { } static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { } static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct ingress_sched_data *q = qdisc_priv(sch); return q->block; } static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv) { struct mini_Qdisc_pair *miniqp = priv; mini_qdisc_pair_swap(miniqp, tp_head); }; static void ingress_ingress_block_set(struct Qdisc *sch, u32 block_index) { struct ingress_sched_data *q = qdisc_priv(sch); q->block_info.block_index = block_index; } static u32 ingress_ingress_block_get(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); return q->block_info.block_index; } static int ingress_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct bpf_mprog_entry *entry; bool created; int err; if (sch->parent != TC_H_INGRESS) return -EOPNOTSUPP; net_inc_ingress_queue(); entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; q->block_info.chain_head_change_priv = &q->miniqp; err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack); if (err) return err; mini_qdisc_pair_block_init(&q->miniqp, q->block); return 0; } static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress); if (sch->parent != TC_H_INGRESS) return; tcf_block_put_ext(q->block, sch, &q->block_info); if (entry) { tcx_miniq_dec(entry); if (!tcx_entry_is_active(entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(entry); } } net_dec_ingress_queue(); } static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static const struct Qdisc_class_ops ingress_class_ops = { .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED, .leaf = ingress_leaf, .find = ingress_find, .walk = ingress_walk, .tcf_block = ingress_tcf_block, .bind_tcf = ingress_bind_filter, .unbind_tcf = ingress_unbind_filter, }; static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = sizeof(struct ingress_sched_data), .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, .ingress_block_set = ingress_ingress_block_set, .ingress_block_get = ingress_ingress_block_get, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("ingress"); struct clsact_sched_data { struct tcf_block *ingress_block; struct tcf_block *egress_block; struct tcf_block_ext_info ingress_block_info; struct tcf_block_ext_info egress_block_info; struct mini_Qdisc_pair miniqp_ingress; struct mini_Qdisc_pair miniqp_egress; }; static unsigned long clsact_find(struct Qdisc *sch, u32 classid) { switch (TC_H_MIN(classid)) { case TC_H_MIN(TC_H_MIN_INGRESS): case TC_H_MIN(TC_H_MIN_EGRESS): return TC_H_MIN(classid); default: return 0; } } static unsigned long clsact_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { return clsact_find(sch, classid); } static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct clsact_sched_data *q = qdisc_priv(sch); switch (cl) { case TC_H_MIN(TC_H_MIN_INGRESS): return q->ingress_block; case TC_H_MIN(TC_H_MIN_EGRESS): return q->egress_block; default: return NULL; } } static void clsact_ingress_block_set(struct Qdisc *sch, u32 block_index) { struct clsact_sched_data *q = qdisc_priv(sch); q->ingress_block_info.block_index = block_index; } static void clsact_egress_block_set(struct Qdisc *sch, u32 block_index) { struct clsact_sched_data *q = qdisc_priv(sch); q->egress_block_info.block_index = block_index; } static u32 clsact_ingress_block_get(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); return q->ingress_block_info.block_index; } static u32 clsact_egress_block_get(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); return q->egress_block_info.block_index; } static int clsact_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct clsact_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct bpf_mprog_entry *entry; bool created; int err; if (sch->parent != TC_H_CLSACT) return -EOPNOTSUPP; net_inc_ingress_queue(); net_inc_egress_queue(); entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress; err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info, extack); if (err) return err; mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block); entry = tcx_entry_fetch_or_create(dev, false, &created); if (!entry) return -ENOMEM; tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, false); q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; return tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info, extack); } static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress); struct bpf_mprog_entry *egress_entry = rtnl_dereference(dev->tcx_egress); if (sch->parent != TC_H_CLSACT) return; tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); if (ingress_entry) { tcx_miniq_dec(ingress_entry); if (!tcx_entry_is_active(ingress_entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(ingress_entry); } } if (egress_entry) { tcx_miniq_dec(egress_entry); if (!tcx_entry_is_active(egress_entry)) { tcx_entry_update(dev, NULL, false); tcx_entry_free(egress_entry); } } net_dec_ingress_queue(); net_dec_egress_queue(); } static const struct Qdisc_class_ops clsact_class_ops = { .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED, .leaf = ingress_leaf, .find = clsact_find, .walk = ingress_walk, .tcf_block = clsact_tcf_block, .bind_tcf = clsact_bind_filter, .unbind_tcf = ingress_unbind_filter, }; static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .cl_ops = &clsact_class_ops, .id = "clsact", .priv_size = sizeof(struct clsact_sched_data), .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS, .init = clsact_init, .destroy = clsact_destroy, .dump = ingress_dump, .ingress_block_set = clsact_ingress_block_set, .egress_block_set = clsact_egress_block_set, .ingress_block_get = clsact_ingress_block_get, .egress_block_get = clsact_egress_block_get, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("clsact"); static int __init ingress_module_init(void) { int ret; ret = register_qdisc(&ingress_qdisc_ops); if (!ret) { ret = register_qdisc(&clsact_qdisc_ops); if (ret) unregister_qdisc(&ingress_qdisc_ops); } return ret; } static void __exit ingress_module_exit(void) { unregister_qdisc(&ingress_qdisc_ops); unregister_qdisc(&clsact_qdisc_ops); } module_init(ingress_module_init); module_exit(ingress_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Ingress and clsact based ingress and egress qdiscs");
3 3 3 3 3 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 // SPDX-License-Identifier: GPL-2.0-only /* * MAC commands interface * * Copyright 2007-2012 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/ieee802154.h> #include <net/ieee802154_netdev.h> #include <net/cfg802154.h> #include <net/mac802154.h> #include "ieee802154_i.h" #include "driver-ops.h" static int mac802154_mlme_start_req(struct net_device *dev, struct ieee802154_addr *addr, u8 channel, u8 page, u8 bcn_ord, u8 sf_ord, u8 pan_coord, u8 blx, u8 coord_realign) { struct ieee802154_llsec_params params; int changed = 0; ASSERT_RTNL(); BUG_ON(addr->mode != IEEE802154_ADDR_SHORT); dev->ieee802154_ptr->pan_id = addr->pan_id; dev->ieee802154_ptr->short_addr = addr->short_addr; mac802154_dev_set_page_channel(dev, page, channel); params.pan_id = addr->pan_id; changed |= IEEE802154_LLSEC_PARAM_PAN_ID; params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr); changed |= IEEE802154_LLSEC_PARAM_HWADDR; params.coord_hwaddr = params.hwaddr; changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR; params.coord_shortaddr = addr->short_addr; changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR; return mac802154_set_params(dev, &params, changed); } static int mac802154_set_mac_params(struct net_device *dev, const struct ieee802154_mac_params *params) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; ASSERT_RTNL(); /* PHY */ wpan_dev->wpan_phy->transmit_power = params->transmit_power; wpan_dev->wpan_phy->cca = params->cca; wpan_dev->wpan_phy->cca_ed_level = params->cca_ed_level; /* MAC */ wpan_dev->min_be = params->min_be; wpan_dev->max_be = params->max_be; wpan_dev->csma_retries = params->csma_retries; wpan_dev->frame_retries = params->frame_retries; wpan_dev->lbt = params->lbt; if (local->hw.phy->flags & WPAN_PHY_FLAG_TXPOWER) { ret = drv_set_tx_power(local, params->transmit_power); if (ret < 0) return ret; } if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_MODE) { ret = drv_set_cca_mode(local, &params->cca); if (ret < 0) return ret; } if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { ret = drv_set_cca_ed_level(local, params->cca_ed_level); if (ret < 0) return ret; } return 0; } static void mac802154_get_mac_params(struct net_device *dev, struct ieee802154_mac_params *params) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; ASSERT_RTNL(); /* PHY */ params->transmit_power = wpan_dev->wpan_phy->transmit_power; params->cca = wpan_dev->wpan_phy->cca; params->cca_ed_level = wpan_dev->wpan_phy->cca_ed_level; /* MAC */ params->min_be = wpan_dev->min_be; params->max_be = wpan_dev->max_be; params->csma_retries = wpan_dev->csma_retries; params->frame_retries = wpan_dev->frame_retries; params->lbt = wpan_dev->lbt; } static const struct ieee802154_llsec_ops mac802154_llsec_ops = { .get_params = mac802154_get_params, .set_params = mac802154_set_params, .add_key = mac802154_add_key, .del_key = mac802154_del_key, .add_dev = mac802154_add_dev, .del_dev = mac802154_del_dev, .add_devkey = mac802154_add_devkey, .del_devkey = mac802154_del_devkey, .add_seclevel = mac802154_add_seclevel, .del_seclevel = mac802154_del_seclevel, .lock_table = mac802154_lock_table, .get_table = mac802154_get_table, .unlock_table = mac802154_unlock_table, }; struct ieee802154_mlme_ops mac802154_mlme_wpan = { .start_req = mac802154_mlme_start_req, .llsec = &mac802154_llsec_ops, .set_mac_params = mac802154_set_mac_params, .get_mac_params = mac802154_get_mac_params, };
21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 /* SPDX-License-Identifier: GPL-2.0 */ /* * XDR standard data types and function declarations * * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> * * Based on: * RFC 4506 "XDR: External Data Representation Standard", May 2006 */ #ifndef _SUNRPC_XDR_H_ #define _SUNRPC_XDR_H_ #include <linux/uio.h> #include <asm/byteorder.h> #include <linux/unaligned.h> #include <linux/scatterlist.h> struct bio_vec; struct rpc_rqst; /* * Size of an XDR encoding unit in bytes, i.e. 32 bits, * as defined in Section 3 of RFC 4506. All encoded * XDR data items are aligned on a boundary of 32 bits. */ #define XDR_UNIT sizeof(__be32) /* * Buffer adjustment */ #define XDR_QUADLEN(l) (((l) + 3) >> 2) /* * Generic opaque `network object.' */ #define XDR_MAX_NETOBJ 1024 struct xdr_netobj { unsigned int len; u8 * data; }; /* * Basic structure for transmission/reception of a client XDR message. * Features a header (for a linear buffer containing RPC headers * and the data payload for short messages), and then an array of * pages. * The tail iovec allows you to append data after the page array. Its * main interest is for appending padding to the pages in order to * satisfy the int_32-alignment requirements in RFC1832. * * For the future, we might want to string several of these together * in a list if anybody wants to make use of NFSv4 COMPOUND * operations and/or has a need for scatter/gather involving pages. */ struct xdr_buf { struct kvec head[1], /* RPC header + non-page data */ tail[1]; /* Appended after page data */ struct bio_vec *bvec; struct page ** pages; /* Array of pages */ unsigned int page_base, /* Start of page data */ page_len, /* Length of page data */ flags; /* Flags for data disposition */ #define XDRBUF_READ 0x01 /* target of file read */ #define XDRBUF_WRITE 0x02 /* source of file write */ #define XDRBUF_SPARSE_PAGES 0x04 /* Page array is sparse */ unsigned int buflen, /* Total length of storage buffer */ len; /* Length of XDR encoded message */ }; static inline void xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) { buf->head[0].iov_base = start; buf->head[0].iov_len = len; buf->tail[0].iov_len = 0; buf->pages = NULL; buf->page_len = 0; buf->flags = 0; buf->len = 0; buf->buflen = len; } /* * pre-xdr'ed macros. */ #define xdr_zero cpu_to_be32(0) #define xdr_one cpu_to_be32(1) #define xdr_two cpu_to_be32(2) #define rpc_auth_null cpu_to_be32(RPC_AUTH_NULL) #define rpc_auth_unix cpu_to_be32(RPC_AUTH_UNIX) #define rpc_auth_short cpu_to_be32(RPC_AUTH_SHORT) #define rpc_auth_gss cpu_to_be32(RPC_AUTH_GSS) #define rpc_auth_tls cpu_to_be32(RPC_AUTH_TLS) #define rpc_call cpu_to_be32(RPC_CALL) #define rpc_reply cpu_to_be32(RPC_REPLY) #define rpc_msg_accepted cpu_to_be32(RPC_MSG_ACCEPTED) #define rpc_success cpu_to_be32(RPC_SUCCESS) #define rpc_prog_unavail cpu_to_be32(RPC_PROG_UNAVAIL) #define rpc_prog_mismatch cpu_to_be32(RPC_PROG_MISMATCH) #define rpc_proc_unavail cpu_to_be32(RPC_PROC_UNAVAIL) #define rpc_garbage_args cpu_to_be32(RPC_GARBAGE_ARGS) #define rpc_system_err cpu_to_be32(RPC_SYSTEM_ERR) #define rpc_drop_reply cpu_to_be32(RPC_DROP_REPLY) #define rpc_mismatch cpu_to_be32(RPC_MISMATCH) #define rpc_auth_error cpu_to_be32(RPC_AUTH_ERROR) #define rpc_auth_ok cpu_to_be32(RPC_AUTH_OK) #define rpc_autherr_badcred cpu_to_be32(RPC_AUTH_BADCRED) #define rpc_autherr_rejectedcred cpu_to_be32(RPC_AUTH_REJECTEDCRED) #define rpc_autherr_badverf cpu_to_be32(RPC_AUTH_BADVERF) #define rpc_autherr_rejectedverf cpu_to_be32(RPC_AUTH_REJECTEDVERF) #define rpc_autherr_tooweak cpu_to_be32(RPC_AUTH_TOOWEAK) #define rpcsec_gsserr_credproblem cpu_to_be32(RPCSEC_GSS_CREDPROBLEM) #define rpcsec_gsserr_ctxproblem cpu_to_be32(RPCSEC_GSS_CTXPROBLEM) /* * Miscellaneous XDR helper functions */ __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len); __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len); __be32 *xdr_encode_string(__be32 *p, const char *s); __be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp, unsigned int maxlen); __be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *); __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); void xdr_inline_pages(struct xdr_buf *, unsigned int, struct page **, unsigned int, unsigned int); void xdr_terminate_string(const struct xdr_buf *, const u32); size_t xdr_buf_pagecount(const struct xdr_buf *buf); int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp); void xdr_free_bvec(struct xdr_buf *buf); unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size, const struct xdr_buf *xdr); static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len) { return xdr_encode_opaque(p, s, len); } /* * Decode 64bit quantities (NFSv3 support) */ static inline __be32 * xdr_encode_hyper(__be32 *p, __u64 val) { put_unaligned_be64(val, p); return p + 2; } static inline __be32 * xdr_decode_hyper(__be32 *p, __u64 *valp) { *valp = get_unaligned_be64(p); return p + 2; } static inline __be32 * xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) { memcpy(ptr, p, len); return p + XDR_QUADLEN(len); } static inline void xdr_netobj_dup(struct xdr_netobj *dst, struct xdr_netobj *src, gfp_t gfp_mask) { dst->data = kmemdup(src->data, src->len, gfp_mask); dst->len = src->len; } /* * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) */ static inline int xdr_adjust_iovec(struct kvec *iov, __be32 *p) { return iov->iov_len = ((u8 *) p - (u8 *) iov->iov_base); } /* * XDR buffer helper functions */ extern void xdr_buf_from_iov(const struct kvec *, struct xdr_buf *); extern int xdr_buf_subsegment(const struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); extern void xdr_buf_trim(struct xdr_buf *, unsigned int); extern int read_bytes_from_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); extern int xdr_encode_word(const struct xdr_buf *, unsigned int, u32); extern int xdr_decode_word(const struct xdr_buf *, unsigned int, u32 *); struct xdr_array2_desc; typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem); struct xdr_array2_desc { unsigned int elem_size; unsigned int array_len; unsigned int array_maxlen; xdr_xcode_elem_t xcode; }; extern int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len); /* * Provide some simple tools for XDR buffer overflow-checking etc. */ struct xdr_stream { __be32 *p; /* start of available buffer */ struct xdr_buf *buf; /* XDR buffer to read/write */ __be32 *end; /* end of available buffer space */ struct kvec *iov; /* pointer to the current kvec */ struct kvec scratch; /* Scratch buffer */ struct page **page_ptr; /* pointer to the current page */ void *page_kaddr; /* kmapped address of the current page */ unsigned int nwords; /* Remaining decode buffer length */ struct rpc_rqst *rqst; /* For debugging */ }; /* * These are the xdr_stream style generic XDR encode and decode functions. */ typedef void (*kxdreproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, const void *obj); typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *obj); extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, struct rpc_rqst *rqst); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes); extern void __xdr_commit_encode(struct xdr_stream *xdr); extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); extern void xdr_truncate_decode(struct xdr_stream *xdr, size_t len); extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern unsigned int xdr_stream_pos(const struct xdr_stream *xdr); extern unsigned int xdr_page_pos(const struct xdr_stream *xdr); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, unsigned int len); extern void xdr_finish_decode(struct xdr_stream *xdr); extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len); extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, unsigned int len); extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset, unsigned int target, unsigned int length); extern unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset, unsigned int length); /** * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. * @xdr: pointer to xdr_stream struct * @buf: pointer to an empty buffer * @buflen: size of 'buf' * * The scratch buffer is used when decoding from an array of pages. * If an xdr_inline_decode() call spans across page boundaries, then * we copy the data into the scratch buffer in order to allow linear * access. */ static inline void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) { xdr->scratch.iov_base = buf; xdr->scratch.iov_len = buflen; } /** * xdr_set_scratch_page - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct * @page: an anonymous page * * See xdr_set_scratch_buffer(). */ static inline void xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) { xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); } /** * xdr_reset_scratch_buffer - Clear scratch buffer information * @xdr: pointer to xdr_stream struct * * See xdr_set_scratch_buffer(). */ static inline void xdr_reset_scratch_buffer(struct xdr_stream *xdr) { xdr_set_scratch_buffer(xdr, NULL, 0); } /** * xdr_commit_encode - Ensure all data is written to xdr->buf * @xdr: pointer to xdr_stream * * Handle encoding across page boundaries by giving the caller a * temporary location to write to, then later copying the data into * place. __xdr_commit_encode() does that copying. */ static inline void xdr_commit_encode(struct xdr_stream *xdr) { if (unlikely(xdr->scratch.iov_len)) __xdr_commit_encode(xdr); } /** * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream * * Return value: * Number of bytes remaining in @xdr before xdr->end */ static inline size_t xdr_stream_remaining(const struct xdr_stream *xdr) { return xdr->nwords << 2; } ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size); ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, size_t maxlen, gfp_t gfp_flags); ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size); ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, size_t maxlen, gfp_t gfp_flags); ssize_t xdr_stream_decode_opaque_auth(struct xdr_stream *xdr, u32 *flavor, void **body, unsigned int *body_len); ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, void *body, unsigned int body_len); /** * xdr_align_size - Calculate padded size of an object * @n: Size of an object being XDR encoded (in bytes) * * Return value: * Size (in bytes) of the object including xdr padding */ static inline size_t xdr_align_size(size_t n) { const size_t mask = XDR_UNIT - 1; return (n + mask) & ~mask; } /** * xdr_pad_size - Calculate size of an object's pad * @n: Size of an object being XDR encoded (in bytes) * * This implementation avoids the need for conditional * branches or modulo division. * * Return value: * Size (in bytes) of the needed XDR pad */ static inline size_t xdr_pad_size(size_t n) { return xdr_align_size(n) - n; } /** * xdr_stream_encode_item_present - Encode a "present" list item * @xdr: pointer to xdr_stream * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) { const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; *p = xdr_one; return len; } /** * xdr_stream_encode_item_absent - Encode a "not present" list item * @xdr: pointer to xdr_stream * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) { const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; *p = xdr_zero; return len; } /** * xdr_encode_bool - Encode a boolean item * @p: address in a buffer into which to encode * @n: boolean value to encode * * Return value: * Address of item following the encoded boolean */ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) { *p++ = n ? xdr_one : xdr_zero; return p; } /** * xdr_stream_encode_bool - Encode a boolean item * @xdr: pointer to xdr_stream * @n: boolean value to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n) { const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; xdr_encode_bool(p, n); return len; } /** * xdr_stream_encode_u32 - Encode a 32-bit integer * @xdr: pointer to xdr_stream * @n: integer to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) { const size_t len = sizeof(n); __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; *p = cpu_to_be32(n); return len; } /** * xdr_stream_encode_be32 - Encode a big-endian 32-bit integer * @xdr: pointer to xdr_stream * @n: integer to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_be32(struct xdr_stream *xdr, __be32 n) { const size_t len = sizeof(n); __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; *p = n; return len; } /** * xdr_stream_encode_u64 - Encode a 64-bit integer * @xdr: pointer to xdr_stream * @n: 64-bit integer to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) { const size_t len = sizeof(n); __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; xdr_encode_hyper(p, n); return len; } /** * xdr_stream_encode_opaque_inline - Encode opaque xdr data * @xdr: pointer to xdr_stream * @ptr: pointer to void pointer * @len: size of object * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) { size_t count = sizeof(__u32) + xdr_align_size(len); __be32 *p = xdr_reserve_space(xdr, count); if (unlikely(!p)) { *ptr = NULL; return -EMSGSIZE; } xdr_encode_opaque(p, NULL, len); *ptr = ++p; return count; } /** * xdr_stream_encode_opaque_fixed - Encode fixed length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t len) { __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; xdr_encode_opaque_fixed(p, ptr, len); return xdr_align_size(len); } /** * xdr_stream_encode_opaque - Encode variable length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) { size_t count = sizeof(__u32) + xdr_align_size(len); __be32 *p = xdr_reserve_space(xdr, count); if (unlikely(!p)) return -EMSGSIZE; xdr_encode_opaque(p, ptr, len); return count; } /** * xdr_stream_encode_uint32_array - Encode variable length array of integers * @xdr: pointer to xdr_stream * @array: array of integers * @array_size: number of elements in @array * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_uint32_array(struct xdr_stream *xdr, const __u32 *array, size_t array_size) { ssize_t ret = (array_size+1) * sizeof(__u32); __be32 *p = xdr_reserve_space(xdr, ret); if (unlikely(!p)) return -EMSGSIZE; *p++ = cpu_to_be32(array_size); for (; array_size > 0; p++, array++, array_size--) *p = cpu_to_be32p(array); return ret; } /** * xdr_item_is_absent - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * * Return values: * %true if the following XDR item is absent * %false if the following XDR item is present */ static inline bool xdr_item_is_absent(const __be32 *p) { return *p == xdr_zero; } /** * xdr_item_is_present - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * * Return values: * %true if the following XDR item is present * %false if the following XDR item is absent */ static inline bool xdr_item_is_present(const __be32 *p) { return *p != xdr_zero; } /** * xdr_stream_decode_bool - Decode a boolean * @xdr: pointer to xdr_stream * @ptr: pointer to a u32 in which to store the result * * Return values: * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) { const size_t count = sizeof(*ptr); __be32 *p = xdr_inline_decode(xdr, count); if (unlikely(!p)) return -EBADMSG; *ptr = (*p != xdr_zero); return 0; } /** * xdr_stream_decode_u32 - Decode a 32-bit integer * @xdr: pointer to xdr_stream * @ptr: location to store integer * * Return values: * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) { const size_t count = sizeof(*ptr); __be32 *p = xdr_inline_decode(xdr, count); if (unlikely(!p)) return -EBADMSG; *ptr = be32_to_cpup(p); return 0; } /** * xdr_stream_decode_be32 - Decode a big-endian 32-bit integer * @xdr: pointer to xdr_stream * @ptr: location to store integer * * Return values: * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr) { const size_t count = sizeof(*ptr); __be32 *p = xdr_inline_decode(xdr, count); if (unlikely(!p)) return -EBADMSG; *ptr = *p; return 0; } /** * xdr_stream_decode_u64 - Decode a 64-bit integer * @xdr: pointer to xdr_stream * @ptr: location to store 64-bit integer * * Return values: * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) { const size_t count = sizeof(*ptr); __be32 *p = xdr_inline_decode(xdr, count); if (unlikely(!p)) return -EBADMSG; xdr_decode_hyper(p, ptr); return 0; } /** * xdr_stream_decode_opaque_fixed - Decode fixed length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: location to store data * @len: size of buffer pointed to by @ptr * * Return values: * On success, returns size of object stored in @ptr * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) { __be32 *p = xdr_inline_decode(xdr, len); if (unlikely(!p)) return -EBADMSG; xdr_decode_opaque_fixed(p, ptr, len); return len; } /** * xdr_stream_decode_opaque_inline - Decode variable length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: location to store pointer to opaque data * @maxlen: maximum acceptable object size * * Note: the pointer stored in @ptr cannot be assumed valid after the XDR * buffer has been destroyed, or even after calling xdr_inline_decode() * on @xdr. It is therefore expected that the object it points to should * be processed immediately. * * Return values: * On success, returns size of object stored in *@ptr * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the object would exceed @maxlen */ static inline ssize_t xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxlen) { __be32 *p; __u32 len; *ptr = NULL; if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) return -EBADMSG; if (len != 0) { p = xdr_inline_decode(xdr, len); if (unlikely(!p)) return -EBADMSG; if (unlikely(len > maxlen)) return -EMSGSIZE; *ptr = p; } return len; } /** * xdr_stream_decode_uint32_array - Decode variable length array of integers * @xdr: pointer to xdr_stream * @array: location to store the integer array or NULL * @array_size: number of elements to store * * Return values: * On success, returns number of elements stored in @array * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the array exceeds @array_size */ static inline ssize_t xdr_stream_decode_uint32_array(struct xdr_stream *xdr, __u32 *array, size_t array_size) { __be32 *p; __u32 len; ssize_t retval; if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) return -EBADMSG; if (U32_MAX >= SIZE_MAX / sizeof(*p) && len > SIZE_MAX / sizeof(*p)) return -EBADMSG; p = xdr_inline_decode(xdr, len * sizeof(*p)); if (unlikely(!p)) return -EBADMSG; if (array == NULL) return len; if (len <= array_size) { if (len < array_size) memset(array+len, 0, (array_size-len)*sizeof(*array)); array_size = len; retval = len; } else retval = -EMSGSIZE; for (; array_size > 0; p++, array++, array_size--) *array = be32_to_cpup(p); return retval; } #endif /* _SUNRPC_XDR_H_ */
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 /* SPDX-License-Identifier: GPL-2.0 */ /* * Mutexes: blocking mutual exclusion locks * * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * This file contains the main data structure and API definitions. */ #ifndef __LINUX_MUTEX_H #define __LINUX_MUTEX_H #include <asm/current.h> #include <linux/list.h> #include <linux/spinlock_types.h> #include <linux/lockdep.h> #include <linux/atomic.h> #include <asm/processor.h> #include <linux/osq_lock.h> #include <linux/debug_locks.h> #include <linux/cleanup.h> #include <linux/mutex_types.h> struct device; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ , .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ } #else # define __DEP_MAP_MUTEX_INITIALIZER(lockname) #endif #ifdef CONFIG_DEBUG_MUTEXES # define __DEBUG_MUTEX_INITIALIZER(lockname) \ , .magic = &lockname extern void mutex_destroy(struct mutex *lock); #else # define __DEBUG_MUTEX_INITIALIZER(lockname) static inline void mutex_destroy(struct mutex *lock) {} #endif /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized * * Initialize the mutex to unlocked state. * * It is not allowed to initialize an already locked mutex. */ #define mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ \ __mutex_init((mutex), #mutex, &__key); \ } while (0) /** * mutex_init_with_key - initialize a mutex with a given lockdep key * @mutex: the mutex to be initialized * @key: the lockdep key to be associated with the mutex * * Initialize the mutex to the unlocked state. * * It is not allowed to initialize an already locked mutex. */ #define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key)) #ifndef CONFIG_PREEMPT_RT #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) extern void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); /** * mutex_is_locked - is the mutex locked * @lock: the mutex to be queried * * Returns true if the mutex is locked, false if unlocked. */ extern bool mutex_is_locked(struct mutex *lock); #else /* !CONFIG_PREEMPT_RT */ /* * Preempt-RT variant based on rtmutexes. */ #define __MUTEX_INITIALIZER(mutexname) \ { \ .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex) \ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) extern void __mutex_rt_init(struct mutex *lock, const char *name, struct lock_class_key *key); #define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) #define __mutex_init(mutex, name, key) \ do { \ rt_mutex_base_init(&(mutex)->rtmutex); \ __mutex_rt_init((mutex), name, key); \ } while (0) #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES int __devm_mutex_init(struct device *dev, struct mutex *lock); #else static inline int __devm_mutex_init(struct device *dev, struct mutex *lock) { /* * When CONFIG_DEBUG_MUTEXES is off mutex_destroy() is just a nop so * no really need to register it in the devm subsystem. */ return 0; } #endif #define devm_mutex_init(dev, mutex) \ ({ \ typeof(mutex) mutex_ = (mutex); \ \ mutex_init(mutex_); \ __devm_mutex_init(dev, mutex_); \ }) /* * See kernel/locking/mutex.c for detailed documentation of these APIs. * Also see Documentation/locking/mutex-design.rst. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); extern int __must_check mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass); extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass); #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) #define mutex_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); extern int __must_check mutex_lock_killable(struct mutex *lock); extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) # define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock) #endif /* * NOTE: mutex_trylock() follows the spin_trylock() convention, * not the down_trylock() convention! * * Returns 1 if the mutex has been acquired successfully, and 0 on contention. */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) extern unsigned long mutex_get_owner(struct mutex *lock); #endif /* __LINUX_MUTEX_H */
11 794 11 11 11 11 11 11 10 10 9 9 11 11 9 11 93 93 93 93 93 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/fib_notifier.h> static unsigned int fib_notifier_net_id; struct fib_notifier_net { struct list_head fib_notifier_ops; struct atomic_notifier_head fib_chain; }; int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { int err; err = nb->notifier_call(nb, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); int err; err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); static unsigned int fib_seq_sum(struct net *net) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; unsigned int fib_seq = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) continue; fib_seq += ops->fib_seq_read(net); module_put(ops->owner); } rcu_read_unlock(); return fib_seq; } static int fib_net_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) continue; err = ops->fib_dump(net, nb, extack); module_put(ops->owner); if (err) goto unlock; } unlock: rcu_read_unlock(); return err; } static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), unsigned int fib_seq) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); atomic_notifier_chain_register(&fn_net->fib_chain, nb); if (fib_seq == fib_seq_sum(net)) return true; atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); if (cb) cb(nb); return false; } #define FIB_DUMP_MAX_RETRIES 5 int register_fib_notifier(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), struct netlink_ext_ack *extack) { int retries = 0; int err; do { unsigned int fib_seq = fib_seq_sum(net); err = fib_net_dump(net, nb, extack); if (err) return err; if (fib_dump_is_consistent(net, nb, cb, fib_seq)) return 0; } while (++retries < FIB_DUMP_MAX_RETRIES); return -EBUSY; } EXPORT_SYMBOL(register_fib_notifier); int unregister_fib_notifier(struct net *net, struct notifier_block *nb) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); static int __fib_notifier_ops_register(struct fib_notifier_ops *ops, struct net *net) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *o; list_for_each_entry(o, &fn_net->fib_notifier_ops, list) if (ops->family == o->family) return -EEXIST; list_add_tail_rcu(&ops->list, &fn_net->fib_notifier_ops); return 0; } struct fib_notifier_ops * fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net) { struct fib_notifier_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (!ops) return ERR_PTR(-ENOMEM); err = __fib_notifier_ops_register(ops, net); if (err) goto err_register; return ops; err_register: kfree(ops); return ERR_PTR(err); } EXPORT_SYMBOL(fib_notifier_ops_register); void fib_notifier_ops_unregister(struct fib_notifier_ops *ops) { list_del_rcu(&ops->list); kfree_rcu(ops, rcu); } EXPORT_SYMBOL(fib_notifier_ops_unregister); static int __net_init fib_notifier_net_init(struct net *net) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); INIT_LIST_HEAD(&fn_net->fib_notifier_ops); ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); return 0; } static void __net_exit fib_notifier_net_exit(struct net *net) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); WARN_ON_ONCE(!list_empty(&fn_net->fib_notifier_ops)); } static struct pernet_operations fib_notifier_net_ops = { .init = fib_notifier_net_init, .exit = fib_notifier_net_exit, .id = &fib_notifier_net_id, .size = sizeof(struct fib_notifier_net), }; static int __init fib_notifier_init(void) { return register_pernet_subsys(&fib_notifier_net_ops); } subsys_initcall(fib_notifier_init);
19 11 60 3 3 3 2 3 2 3 21 56 27 45 46 89 95 24 21 13 1 1 2 2 1 1 1 1 1 1 2 2 15 15 14 15 15 61 62 67 58 23 23 15 15 12 3 15 15 14 16 16 16 16 16 16 16 16 3 3 3 3 3 3 22 21 23 21 21 21 3 3 3 58 58 7 58 57 51 3 3 3 16 16 2 1 1 93 92 23 23 21 21 21 3 3 3 3 3 16 16 16 16 15 15 15 15 23 23 21 21 15 3 3 3 3 2 3 2 2 2 2 2 1 1 1 2 3 3 1 1 2 2 2 1 1 6 4 3 3 3 1 18 18 18 2 2 5 5 55 10 55 55 55 55 55 26 67 67 55 26 22 23 23 23 23 23 23 23 22 15 15 14 15 15 14 730 732 730 732 24 24 24 2197 2193 2196 300 24 40 39 39 2192 21 21 16 5 5 20 4 3 3 3 2 5 5 5 1 1 1 22 23 21 22 23 23 23 23 23 23 22 23 23 23 23 23 22 23 22 23 22 23 23 23 23 23 23 23 23 22 23 23 23 23 23 22 23 23 23 23 22 23 22 23 23 23 23 23 22 23 23 23 23 23 22 22 23 23 23 23 23 23 23 23 22 23 21 23 23 23 23 23 23 23 23 23 23 23 23 21 23 23 23 23 22 23 23 22 23 23 23 21 23 22 23 23 23 23 22 23 22 22 23 23 23 22 23 23 23 23 23 23 23 22 23 23 23 23 23 23 23 23 23 23 23 23 22 23 22 23 23 23 23 23 23 22 23 1 1 21 21 7 18 1010 969 21 20 2176 2154 28 3 1011 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the * xarray of the same name. Specifically it allows the caller to assert that * the MARK will/will not be changing under the lock, and for devices and * clients, that the value in the xarray is still a valid pointer. Change of * the MARK is linked to the object state, so holding the lock and testing the * MARK also asserts that the contained object is in a certain state. * * This is used to build a two stage register/unregister flow where objects * can continue to be in the xarray even though they are still in progress to * register/unregister. * * The xarray itself provides additional locking, and restartable iteration, * which is also relied on. * * Locks should not be nested, with the exception of client_data, which is * allowed to nest under the read side of the other two locks. * * The devices_rwsem also protects the device name list, any change or * assignment of device name must also hold the write side to guarantee unique * names. */ /* * devices contains devices that have had their names assigned. The * devices may not be registered. Users that care about the registration * status need to call ib_device_try_get() on the device to ensure it is * registered, and keep it registered, for the required duration. * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); static DECLARE_RWSEM(clients_rwsem); static void ib_client_put(struct ib_client *client) { if (refcount_dec_and_test(&client->uses)) complete(&client->uses_zero); } /* * If client_data is registered then the corresponding client must also still * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary * because we can't get the locking right using the existing net ns list. We * would require a init_net callback after the list is updated. */ static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); /* * rwsem to protect accessing the rdma_nets xarray entries. */ static DECLARE_RWSEM(rdma_nets_rwsem); bool ib_devices_shared_netns = true; module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); MODULE_PARM_DESC(netns_mode, "Share device among net namespaces; default=1 (shared)"); /** * rdma_dev_access_netns() - Return whether an rdma device can be accessed * from a specified net namespace or not. * @dev: Pointer to rdma device which needs to be checked * @net: Pointer to net namesapce for which access to be checked * * When the rdma device is in shared mode, it ignores the net namespace. * When the rdma device is exclusive to a net namespace, rdma device net * namespace is checked against the specified one. */ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) { return (ib_devices_shared_netns || net_eq(read_pnet(&dev->coredev.rdma_net), net)); } EXPORT_SYMBOL(rdma_dev_access_netns); /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in * the array. This does the same thing as xa_for_each except that it also * returns NULL valued entries if the array is allocating. Simplified to only * work on simple xarrays. */ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { entry = xas_find_marked(&xas, ULONG_MAX, filter); if (xa_is_zero(entry)) break; } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) { *indexp = xas.xa_index; if (xa_is_zero(entry)) return NULL; return entry; } return XA_ERROR(-ENOENT); } #define xan_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) /* RCU hash table mapping netdevice pointers to struct ib_port_data */ static DEFINE_SPINLOCK(ndev_hash_lock); static DECLARE_HASHTABLE(ndev_hash, 5); static void free_netdevs(struct ib_device *ib_dev); static void ib_unregister_work(struct work_struct *work); static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); static void __ibdev_printk(const char *level, const struct ib_device *ibdev, struct va_format *vaf) { if (ibdev && ibdev->dev.parent) dev_printk_emit(level[1] - '0', ibdev->dev.parent, "%s %s %s: %pV", dev_driver_string(ibdev->dev.parent), dev_name(ibdev->dev.parent), dev_name(&ibdev->dev), vaf); else if (ibdev) printk("%s%s: %pV", level, dev_name(&ibdev->dev), vaf); else printk("%s(NULL ib_device): %pV", level, vaf); } #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __ibdev_printk(level, ibdev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); define_ibdev_printk_level(ibdev_alert, KERN_ALERT); define_ibdev_printk_level(ibdev_crit, KERN_CRIT); define_ibdev_printk_level(ibdev_err, KERN_ERR); define_ibdev_printk_level(ibdev_warn, KERN_WARNING); define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); define_ibdev_printk_level(ibdev_info, KERN_INFO); static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net); /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; struct ib_port_data pdata[]; }; static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { device->kverbs_provider = false; break; } } } /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. */ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!rdma_dev_access_netns(device, net)) { device = NULL; goto out; } if (!ib_device_try_get(device)) device = NULL; } out: up_read(&devices_rwsem); return device; } /** * ib_device_put - Release IB device reference * @device: device whose reference to be released * * ib_device_put() releases reference to the IB device to allow it to be * unregistered and eventually free. */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; unsigned long index; xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } /** * ib_device_get_by_name - Find an IB device by name * @name: The name to look for * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device by its name. The caller must call * ib_device_put() on the returned pointer. */ struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id) { struct ib_device *device; down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && device->ops.driver_id != driver_id) device = NULL; if (device) { if (!ib_device_try_get(device)) device = NULL; } up_read(&devices_rwsem); return device; } EXPORT_SYMBOL(ib_device_get_by_name); static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; int ret = 0; mutex_lock(&device->compat_devs_mutex); xa_for_each (&device->compat_devs, index, cdev) { ret = device_rename(&cdev->dev, dev_name(&device->dev)); if (ret) { dev_warn(&cdev->dev, "Fail to rename compatdev to new name %s\n", dev_name(&device->dev)); break; } } mutex_unlock(&device->compat_devs_mutex); return ret; } int ib_device_rename(struct ib_device *ibdev, const char *name) { unsigned long index; void *client_data; int ret; down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { up_write(&devices_rwsem); return 0; } if (__ib_device_get_by_name(name)) { up_write(&devices_rwsem); return -EEXIST; } ret = device_rename(&ibdev->dev, name); if (ret) { up_write(&devices_rwsem); return ret; } strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); ret = rename_compat_devs(ibdev); downgrade_write(&devices_rwsem); down_read(&ibdev->client_data_rwsem); xan_for_each_marked(&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->rename) continue; client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); up_read(&devices_rwsem); return 0; } int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) { if (use_dim > 1) return -EINVAL; ibdev->use_cq_dim = use_dim; return 0; } static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; unsigned long index; struct ida inuse; int rc; int i; lockdep_assert_held_write(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); if (strcmp(buf, dev_name(&device->dev)) != 0) continue; rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); if (rc < 0) goto out; } rc = ida_alloc(&inuse, GFP_KERNEL); if (rc < 0) goto out; rc = dev_set_name(&ibdev->dev, name, rc); out: ida_destroy(&inuse); return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); if (dev->hw_stats_data) ib_device_release_hw_stats(dev->hw_stats_data); if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } mutex_destroy(&dev->subdev_lock); mutex_destroy(&dev->unregistration_lock); mutex_destroy(&dev->compat_devs_mutex); xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); } static int ib_device_uevent(const struct device *device, struct kobj_uevent_env *env) { if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* * It would be nice to pass the node GUID with the event... */ return 0; } static const void *net_namespace(const struct device *d) { const struct ib_core_device *coredev = container_of(d, struct ib_core_device, dev); return read_pnet(&coredev->rdma_net); } static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, }; static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { bool is_full_dev = &dev->coredev == coredev; /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers * driver uses it. Adding anything in ib_core_device before * device will break this assumption. */ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != offsetof(struct ib_device, dev)); coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; /* * Don't expose hw counters outside of the init namespace. */ if (!is_full_dev && dev->hw_stats_attr_index) coredev->dev.groups[dev->hw_stats_attr_index] = NULL; device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); write_pnet(&coredev->rdma_net, net); } /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; device = kzalloc(size, GFP_KERNEL); if (!device) return NULL; if (rdma_restrack_init(device)) { kfree(device); return NULL; } rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); init_rwsem(&device->event_handler_rwsem); mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data. */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); spin_lock_init(&device->cq_pools_lock); for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) INIT_LIST_HEAD(&device->cq_pools[i]); rwlock_init(&device->cache_lock); device->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); mutex_init(&device->subdev_lock); INIT_LIST_HEAD(&device->subdev_list_head); INIT_LIST_HEAD(&device->subdev_list); return device; } EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->ops.dealloc_driver) device->ops.dealloc_driver(device); /* * ib_unregister_driver() requires all devices to remain in the xarray * while their ops are callable. The last op we call is dealloc_driver * above. This is needed to create a fence on op callbacks prior to * allowing the driver module to unload. */ down_write(&devices_rwsem); if (xa_load(&devices, device->index) == device) xa_erase(&devices, device->index); up_write(&devices_rwsem); /* Expedite releasing netdev references */ free_netdevs(device); WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); /* * add_client_context() and remove_client_context() must be safe against * parallel calls on the same device - registration/unregistration of both the * device and client can be occurring in parallel. * * The routines need to be a fence, any caller must not return until the add * or remove is fully completed. */ static int add_client_context(struct ib_device *device, struct ib_client *client) { int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) return 0; down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device * unregistration locks. */ if (!refcount_inc_not_zero(&client->uses)) goto out_unlock; refcount_inc(&device->refcount); /* * Another caller to add_client_context got here first and has already * completely initialized context. */ if (xa_get_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED)) goto out; ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, GFP_KERNEL)); if (ret) goto out; downgrade_write(&device->client_data_rwsem); if (client->add) { if (client->add(device)) { /* * If a client fails to add then the error code is * ignored, but we won't call any more ops on this * client. */ xa_erase(&device->client_data, client->client_id); up_read(&device->client_data_rwsem); ib_device_put(device); ib_client_put(client); return 0; } } /* Readers shall not see a client until add has been completed */ xa_set_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED); up_read(&device->client_data_rwsem); return 0; out: ib_device_put(device); ib_client_put(client); out_unlock: up_write(&device->client_data_rwsem); return ret; } static void remove_client_context(struct ib_device *device, unsigned int client_id) { struct ib_client *client; void *client_data; down_write(&device->client_data_rwsem); if (!xa_get_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED)) { up_write(&device->client_data_rwsem); return; } client_data = xa_load(&device->client_data, client_id); xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); client = xa_load(&clients, client_id); up_write(&device->client_data_rwsem); /* * Notice we cannot be holding any exclusive locks when calling the * remove callback as the remove callback can recurse back into any * public functions in this module and thus try for any locks those * functions take. * * For this reason clients and drivers should not call the * unregistration functions will holdling any locks. */ if (client->remove) client->remove(device, client_data); xa_erase(&device->client_data, client_id); ib_device_put(device); ib_client_put(client); } static int alloc_port_data(struct ib_device *device) { struct ib_port_data_rcu *pdata_rcu; u32 port; if (device->port_data) return 0; /* This can only be called once the physical port range is defined */ if (WARN_ON(!device->phys_port_cnt)) return -EINVAL; /* Reserve U32_MAX so the logic to go over all the ports is sane */ if (WARN_ON(device->phys_port_cnt == U32_MAX)) return -EINVAL; /* * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning. */ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, size_add(rdma_end_port(device), 1)), GFP_KERNEL); if (!pdata_rcu) return -ENOMEM; /* * The rcu_head is put in front of the port data array and the stored * pointer is adjusted since we never need to see that member until * kfree_rcu. */ device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } static int verify_immutable(const struct ib_device *dev, u32 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); } static int setup_port_data(struct ib_device *device) { u32 port; int ret; ret = alloc_port_data(device); if (ret) return ret; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; ret = device->ops.get_port_immutable(device, port, &pdata->immutable); if (ret) return ret; if (verify_immutable(device, port)) return -EINVAL; } return 0; } /** * ib_port_immutable_read() - Read rdma port's immutable data * @dev: IB device * @port: port number whose immutable data to read. It starts with index 1 and * valid upto including rdma_end_port(). */ const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port) { WARN_ON(!rdma_is_port_valid(dev, port)); return &dev->port_data[port].immutable; } EXPORT_SYMBOL(ib_port_immutable_read); void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) dev->ops.get_dev_fw_str(dev, str); else str[0] = '\0'; } EXPORT_SYMBOL(ib_get_device_fw_str); static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned int i; rdma_for_each_port (dev, i) { u64 sp; ib_get_cached_subnet_prefix(dev, i, &sp); ib_security_cache_change(dev, i, sp); } } up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; schedule_work(&ib_policy_change_work); ib_mad_agent_security_change(); return NOTIFY_OK; } static void compatdev_release(struct device *dev) { struct ib_core_device *cdev = container_of(dev, struct ib_core_device, dev); kfree(cdev); } static int add_one_compat_dev(struct ib_device *device, struct rdma_dev_net *rnet) { struct ib_core_device *cdev; int ret; lockdep_assert_held(&rdma_nets_rwsem); if (!ib_devices_shared_netns) return 0; /* * Create and add compat device in all namespaces other than where it * is currently bound to. */ if (net_eq(read_pnet(&rnet->net), read_pnet(&device->coredev.rdma_net))) return 0; /* * The first of init_net() or ib_register_device() to take the * compat_devs_mutex wins and gets to add the device. Others will wait * for completion here. */ mutex_lock(&device->compat_devs_mutex); cdev = xa_load(&device->compat_devs, rnet->id); if (cdev) { ret = 0; goto done; } ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); if (ret) goto done; cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) { ret = -ENOMEM; goto cdev_err; } cdev->dev.parent = device->dev.parent; rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); cdev->dev.release = compatdev_release; ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); if (ret) goto add_err; ret = device_add(&cdev->dev); if (ret) goto add_err; ret = ib_setup_port_attrs(cdev); if (ret) goto port_err; ret = xa_err(xa_store(&device->compat_devs, rnet->id, cdev, GFP_KERNEL)); if (ret) goto insert_err; mutex_unlock(&device->compat_devs_mutex); return 0; insert_err: ib_free_port_attrs(cdev); port_err: device_del(&cdev->dev); add_err: put_device(&cdev->dev); cdev_err: xa_release(&device->compat_devs, rnet->id); done: mutex_unlock(&device->compat_devs_mutex); return ret; } static void remove_one_compat_dev(struct ib_device *device, u32 id) { struct ib_core_device *cdev; mutex_lock(&device->compat_devs_mutex); cdev = xa_erase(&device->compat_devs, id); mutex_unlock(&device->compat_devs_mutex); if (cdev) { ib_free_port_attrs(cdev); device_del(&cdev->dev); put_device(&cdev->dev); } } static void remove_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; xa_for_each (&device->compat_devs, index, cdev) remove_one_compat_dev(device, index); } static int add_compat_devs(struct ib_device *device) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; lockdep_assert_held(&devices_rwsem); down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, index, rnet) { ret = add_one_compat_dev(device, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); return ret; } static void remove_all_compat_devs(void) { struct ib_compat_device *cdev; struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { unsigned long c_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&dev->compat_devs, c_index, cdev) remove_one_compat_dev(dev, c_index); up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); } static int add_all_compat_devs(void) { struct rdma_dev_net *rnet; struct ib_device *dev; unsigned long index; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned long net_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, net_index, rnet) { ret = add_one_compat_dev(dev, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); if (ret) remove_all_compat_devs(); return ret; } int rdma_compatdev_set(u8 enable) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; down_write(&rdma_nets_rwsem); if (ib_devices_shared_netns == enable) { up_write(&rdma_nets_rwsem); return 0; } /* enable/disable of compat devices is not supported * when more than default init_net exists. */ xa_for_each (&rdma_nets, index, rnet) { ret++; break; } if (!ret) ib_devices_shared_netns = enable; up_write(&rdma_nets_rwsem); if (ret) return -EBUSY; if (enable) ret = add_all_compat_devs(); else remove_all_compat_devs(); return ret; } static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; down_write(&rdma_nets_rwsem); /* * Prevent the ID from being re-used and hide the id from xa_for_each. */ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); WARN_ON(ret); up_write(&rdma_nets_rwsem); down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { get_device(&dev->dev); /* * Release the devices_rwsem so that pontentially blocking * device_del, doesn't hold the devices_rwsem for too long. */ up_read(&devices_rwsem); remove_one_compat_dev(dev, rnet->id); /* * If the real device is in the NS then move it back to init. */ rdma_dev_change_netns(dev, net, &init_net); put_device(&dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; write_pnet(&rnet->net, net); ret = rdma_nl_net_init(rnet); if (ret) return ret; /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); if (ret) { rdma_nl_net_exit(rnet); return ret; } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { /* Hold nets_rwsem so that netlink command cannot change * system configuration for device sharing mode. */ down_read(&rdma_nets_rwsem); ret = add_one_compat_dev(dev, rnet); up_read(&rdma_nets_rwsem); if (ret) break; } up_read(&devices_rwsem); if (ret) rdma_dev_exit_net(net); return ret; } /* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device. */ static int assign_name(struct ib_device *device, const char *name) { static u32 last_id; int ret; down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); else ret = dev_set_name(&device->dev, name); if (ret) goto out; if (__ib_device_get_by_name(dev_name(&device->dev))) { ret = -ENFILE; goto out; } strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, &last_id, GFP_KERNEL); if (ret > 0) ret = 0; out: up_write(&devices_rwsem); return ret; } /* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during * ib_alloc_device. It is undone by ib_dealloc_device(). */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->ops.query_device(device, &device->attrs, &uhw); if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); return ret; } return 0; } static void disable_device(struct ib_device *device) { u32 cid; WARN_ON(!refcount_read(&device->refcount)); down_write(&devices_rwsem); xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&devices_rwsem); /* * Remove clients in LIFO order, see assign_client_id. This could be * more efficient if xarray learns to reverse iterate. Since no new * clients can be added to this ib_device past this point we only need * the maximum possible client_id value here. */ down_read(&clients_rwsem); cid = highest_client_id; up_read(&clients_rwsem); while (cid) { cid--; remove_client_context(device, cid); } ib_cq_pool_cleanup(device); /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); /* * compat devices must be removed after device refcount drops to zero. * Otherwise init_net() may add more compatdevs after removing compat * devices and before device is disabled. */ remove_compat_devs(device); } /* * An enabled device is visible to all clients and to all the public facing * APIs that return a device pointer. This always returns with a new get, even * if it fails. */ static int enable_device_and_get(struct ib_device *device) { struct ib_client *client; unsigned long index; int ret = 0; /* * One ref belongs to the xa and the other belongs to this * thread. This is needed to guard against parallel unregistration. */ refcount_set(&device->refcount, 2); down_write(&devices_rwsem); xa_set_mark(&devices, device->index, DEVICE_REGISTERED); /* * By using downgrade_write() we ensure that no other thread can clear * DEVICE_REGISTERED while we are completing the client setup. */ downgrade_write(&devices_rwsem); if (device->ops.enable_driver) { ret = device->ops.enable_driver(device); if (ret) goto out; } down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); if (ret) break; } up_read(&clients_rwsem); if (!ret) ret = add_compat_devs(device); out: up_read(&devices_rwsem); return ret; } static void prevent_dealloc_device(struct ib_device *ib_dev) { } static void ib_device_notify_register(struct ib_device *device) { struct net_device *netdev; u32 port; int ret; down_read(&devices_rwsem); ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) goto out; rdma_for_each_port(device, port) { netdev = ib_device_get_netdev(device, port); if (!netdev) continue; ret = rdma_nl_notify_event(device, port, RDMA_NETDEV_ATTACH_EVENT); dev_put(netdev); if (ret) goto out; } out: up_read(&devices_rwsem); } /** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will * cause a unique index to be added to the passed device name. * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB * device will be used. In this case the caller should fully * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). * * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() * asynchronously then the device pointer may become freed as soon as this * function returns. */ int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device) { int ret; ret = assign_name(device, name); if (ret) return ret; /* * If the caller does not provide a DMA capable device then the IB core * will set up ib_sge and scatterlist structures that stash the kernel * virtual address into the address field. */ WARN_ON(dma_device && !dma_device->dma_parms); device->dma_device = dma_device; ret = setup_device(device); if (ret) return ret; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); return ret; } device->groups[0] = &ib_dev_attr_group; device->groups[1] = device->ops.device_group; ret = ib_setup_device_attrs(device); if (ret) goto cache_cleanup; ib_device_register_rdmacg(device); rdma_counter_init(device); /* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet. */ dev_set_uevent_suppress(&device->dev, true); ret = device_add(&device->dev); if (ret) goto cg_cleanup; ret = ib_setup_port_attrs(&device->coredev); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); goto dev_cleanup; } ret = enable_device_and_get(device); if (ret) { void (*dealloc_fn)(struct ib_device *); /* * If we hit this error flow then we don't want to * automatically dealloc the device since the caller is * expected to call ib_dealloc_device() after * ib_register_device() fails. This is tricky due to the * possibility for a parallel unregistration along with this * error flow. Since we have a refcount here we know any * parallel flow is stopped in disable_device and will see the * special dealloc_driver pointer, causing the responsibility to * ib_dealloc_device() to revert back to this thread. */ dealloc_fn = device->ops.dealloc_driver; device->ops.dealloc_driver = prevent_dealloc_device; ib_device_put(device); __ib_unregister_device(device); device->ops.dealloc_driver = dealloc_fn; dev_set_uevent_suppress(&device->dev, false); return ret; } dev_set_uevent_suppress(&device->dev, false); /* Mark for userspace that device is ready */ kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_notify_register(device); ib_device_put(device); return 0; dev_cleanup: device_del(&device->dev); cg_cleanup: dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); cache_cleanup: ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); /* Callers must hold a get on the device. */ static void __ib_unregister_device(struct ib_device *ib_dev) { struct ib_device *sub, *tmp; mutex_lock(&ib_dev->subdev_lock); list_for_each_entry_safe_reverse(sub, tmp, &ib_dev->subdev_list_head, subdev_list) { list_del(&sub->subdev_list); ib_dev->ops.del_sub_dev(sub); ib_device_put(ib_dev); } mutex_unlock(&ib_dev->subdev_lock); /* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truely * unregistered even if multiple callers are unregistering it at the * same time. This also interacts with the registration flow and * provides sane semantics if register and unregister are racing. */ mutex_lock(&ib_dev->unregistration_lock); if (!refcount_read(&ib_dev->refcount)) goto out; disable_device(ib_dev); rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); ib_free_port_attrs(&ib_dev->coredev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); ib_cache_cleanup_one(ib_dev); /* * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success. */ if (ib_dev->ops.dealloc_driver && ib_dev->ops.dealloc_driver != prevent_dealloc_device) { WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); ib_dealloc_device(ib_dev); } out: mutex_unlock(&ib_dev->unregistration_lock); } /** * ib_unregister_device - Unregister an IB device * @ib_dev: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. * * Callers should call this routine only once, and protect against races with * registration. Typically it should only be called as part of a remove * callback in an implementation of driver core's struct device_driver and * related. * * If ops.dealloc_driver is used then ib_dev will be freed upon return from * this function. */ void ib_unregister_device(struct ib_device *ib_dev) { get_device(&ib_dev->dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device); /** * ib_unregister_device_and_put - Unregister a device while holding a 'get' * @ib_dev: The device to unregister * * This is the same as ib_unregister_device(), except it includes an internal * ib_device_put() that should match a 'get' obtained by the caller. * * It is safe to call this routine concurrently from multiple threads while * holding the 'get'. When the function returns the device is fully * unregistered. * * Drivers using this flow MUST use the driver_unregister callback to clean up * their resources associated with the device and dealloc it. */ void ib_unregister_device_and_put(struct ib_device *ib_dev) { WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); ib_device_put(ib_dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_and_put); /** * ib_unregister_driver - Unregister all IB devices for a driver * @driver_id: The driver to unregister * * This implements a fence for device unregistration. It only returns once all * devices associated with the driver_id have fully completed their * unregistration and returned from ib_unregister_device*(). * * If device's are not yet unregistered it goes ahead and starts unregistering * them. * * This does not block creation of new devices with the given driver_id, that * is the responsibility of the caller. */ void ib_unregister_driver(enum rdma_driver_id driver_id) { struct ib_device *ib_dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); up_read(&devices_rwsem); WARN_ON(!ib_dev->ops.dealloc_driver); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); } EXPORT_SYMBOL(ib_unregister_driver); static void ib_unregister_work(struct work_struct *work) { struct ib_device *ib_dev = container_of(work, struct ib_device, unregistration_work); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } /** * ib_unregister_device_queued - Unregister a device using a work queue * @ib_dev: The device to unregister * * This schedules an asynchronous unregistration using a WQ for the device. A * driver should use this to avoid holding locks while doing unregistration, * such as holding the RTNL lock. * * Drivers using this API must use ib_unregister_driver before module unload * to ensure that all scheduled unregistrations have completed. */ void ib_unregister_device_queued(struct ib_device *ib_dev) { WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); /* * The caller must pass in a device that has the kref held and the refcount * released. If the device is in cur_net and still registered then it is moved * into net. */ static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net) { int ret2 = -EINVAL; int ret; mutex_lock(&device->unregistration_lock); /* * If a device not under ib_device_get() or if the unregistration_lock * is not held, the namespace can be changed, or it can be unregistered. * Check again under the lock. */ if (refcount_read(&device->refcount) == 0 || !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { ret = -ENODEV; goto out; } kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); disable_device(device); /* * At this point no one can be using the device, so it is safe to * change the namespace. */ write_pnet(&device->coredev.rdma_net, net); down_read(&devices_rwsem); /* * Currently rdma devices are system wide unique. So the device name * is guaranteed free in the new namespace. Publish the new namespace * at the sysfs level. */ ret = device_rename(&device->dev, dev_name(&device->dev)); up_read(&devices_rwsem); if (ret) { dev_warn(&device->dev, "%s: Couldn't rename device after namespace change\n", __func__); /* Try and put things back and re-enable the device */ write_pnet(&device->coredev.rdma_net, cur_net); } ret2 = enable_device_and_get(device); if (ret2) { /* * This shouldn't really happen, but if it does, let the user * retry at later point. So don't disable the device. */ dev_warn(&device->dev, "%s: Couldn't re-enable device after namespace change\n", __func__); } kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); out: mutex_unlock(&device->unregistration_lock); if (ret) return ret; return ret2; } int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd) { struct net *net; int ret; net = get_net_ns_by_fd(ns_fd); if (IS_ERR(net)) { ret = PTR_ERR(net); goto net_err; } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto ns_err; } /* * All the ib_clients, including uverbs, are reset when the namespace is * changed and this cannot be blocked waiting for userspace to do * something, so disassociation is mandatory. */ if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { ret = -EOPNOTSUPP; goto ns_err; } get_device(&dev->dev); ib_device_put(dev); ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); put_device(&dev->dev); put_net(net); return ret; ns_err: put_net(net); net_err: ib_device_put(dev); return ret; } static struct pernet_operations rdma_dev_net_ops = { .init = rdma_dev_init_net, .exit = rdma_dev_exit_net, .id = &rdma_dev_net_id, .size = sizeof(struct rdma_dev_net), }; static int assign_client_id(struct ib_client *client) { int ret; lockdep_assert_held(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in * registration order. */ client->client_id = highest_client_id; ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) return ret; highest_client_id++; xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); return 0; } static void remove_client_id(struct ib_client *client) { down_write(&clients_rwsem); xa_erase(&clients, client->client_id); for (; highest_client_id; highest_client_id--) if (xa_load(&clients, highest_client_id - 1)) break; up_write(&clients_rwsem); } /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; unsigned long index; bool need_unreg = false; int ret; refcount_set(&client->uses, 1); init_completion(&client->uses_zero); /* * The devices_rwsem is held in write mode to ensure that a racing * ib_register_device() sees a consisent view of clients and devices. */ down_write(&devices_rwsem); down_write(&clients_rwsem); ret = assign_client_id(client); if (ret) goto out; need_unreg = true; xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { ret = add_client_context(device, client); if (ret) goto out; } ret = 0; out: up_write(&clients_rwsem); up_write(&devices_rwsem); if (need_unreg && ret) ib_unregister_client(client); return ret; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. * * This is a full fence, once it returns no client callbacks will be called, * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { struct ib_device *device; unsigned long index; down_write(&clients_rwsem); ib_client_put(client); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&clients_rwsem); /* We do not want to have locks while calling client->remove() */ rcu_read_lock(); xa_for_each (&devices, index, device) { if (!ib_device_try_get(device)) continue; rcu_read_unlock(); remove_client_context(device, client->client_id); ib_device_put(device); rcu_read_lock(); } rcu_read_unlock(); /* * remove_client_context() is not a fence, it can return even though a * removal is ongoing. Wait until all removals are completed. */ wait_for_completion(&client->uses_zero); remove_client_id(client); } EXPORT_SYMBOL(ib_unregister_client); static int __ib_get_global_client_nl_info(const char *client_name, struct ib_client_nl_info *res) { struct ib_client *client; unsigned long index; int ret = -ENOENT; down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { if (strcmp(client->name, client_name) != 0) continue; if (!client->get_global_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_global_nl_info(res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&clients_rwsem); return ret; } static int __ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { unsigned long index; void *client_data; int ret = -ENOENT; down_read(&ibdev->client_data_rwsem); xan_for_each_marked (&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || strcmp(client->name, client_name) != 0) continue; if (!client->get_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_nl_info(ibdev, client_data, res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; /* * The cdev is guaranteed valid as long as we are inside the * client_data_rwsem as remove_one can't be called. Keep it * valid for the caller. */ if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&ibdev->client_data_rwsem); return ret; } /** * ib_get_client_nl_info - Fetch the nl_info from a client * @ibdev: IB device * @client_name: Name of the client * @res: Result of the query */ int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { int ret; if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); #ifdef CONFIG_MODULES if (ret == -ENOENT) { request_module("rdma-client-%s", client_name); if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); } #endif if (ret) { if (ret == -ENOENT) return -EOPNOTSUPP; return ret; } if (WARN_ON(!res->cdev)) return -EINVAL; return 0; } /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context data that can be retrieved with * ib_get_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { void *rc; if (WARN_ON(IS_ERR(data))) data = NULL; rc = xa_store(&device->client_data, client->client_id, data, GFP_KERNEL); WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback occurs in workqueue context. */ void ib_register_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ void ib_unregister_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_del(&event_handler->list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_unregister_event_handler); void ib_dispatch_event_clients(struct ib_event *event) { struct ib_event_handler *handler; down_read(&event->device->event_handler_rwsem); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); up_read(&event->device->event_handler_rwsem); } static int iw_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { struct in_device *inetdev; struct net_device *netdev; memset(port_attr, 0, sizeof(*port_attr)); netdev = ib_device_get_netdev(device, port_num); if (!netdev) return -ENODEV; port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); if (!netif_carrier_ok(netdev)) { port_attr->state = IB_PORT_DOWN; port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; } else { rcu_read_lock(); inetdev = __in_dev_get_rcu(netdev); if (inetdev && inetdev->ifa_list) { port_attr->state = IB_PORT_ACTIVE; port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } else { port_attr->state = IB_PORT_INIT; port_attr->phys_state = IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; } rcu_read_unlock(); } dev_put(netdev); return device->ops.query_port(device, port_num, port_attr); } static int __ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { int err; memset(port_attr, 0, sizeof(*port_attr)); err = device->ops.query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) return 0; ib_get_cached_subnet_prefix(device, port_num, &port_attr->subnet_prefix); return 0; } /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (rdma_protocol_iwarp(device, port_num)) return iw_query_port(device, port_num, port_attr); else return __ib_query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); static void add_ndev_hash(struct ib_port_data *pdata) { unsigned long flags; might_sleep(); spin_lock_irqsave(&ndev_hash_lock, flags); if (hash_hashed(&pdata->ndev_hash_link)) { hash_del_rcu(&pdata->ndev_hash_link); spin_unlock_irqrestore(&ndev_hash_lock, flags); /* * We cannot do hash_add_rcu after a hash_del_rcu until the * grace period */ synchronize_rcu(); spin_lock_irqsave(&ndev_hash_lock, flags); } if (pdata->netdev) hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, (uintptr_t)pdata->netdev); spin_unlock_irqrestore(&ndev_hash_lock, flags); } /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify * @ndev: net_device to affiliate, may be NULL * @port: IB port the net_device is connected to * * Drivers should use this to link the ib_device to a netdev so the netdev * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be * affiliated with any port. * * The caller must ensure that the given ndev is not unregistered or * unregistering, and that either the ib_device is unregistered or * ib_device_set_netdev() is called with NULL when the ndev sends a * NETDEV_UNREGISTER event. */ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; int ret; if (!rdma_is_port_valid(ib_dev, port)) return -EINVAL; /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. */ ret = alloc_port_data(ib_dev); if (ret) return ret; pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } rcu_assign_pointer(pdata->netdev, ndev); netdev_put(old_ndev, &pdata->netdev_tracker); netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); /* Make sure that the device is registered before we send events */ if (xa_load(&devices, ib_dev->index) != ib_dev) return 0; etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; rdma_nl_notify_event(ib_dev, port, etype); return 0; } EXPORT_SYMBOL(ib_device_set_netdev); static void free_netdevs(struct ib_device *ib_dev) { unsigned long flags; u32 port; if (!ib_dev->port_data) return; rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (ndev) { spin_lock(&ndev_hash_lock); hash_del_rcu(&pdata->ndev_hash_link); spin_unlock(&ndev_hash_lock); /* * If this is the last dev_put there is still a * synchronize_rcu before the netdev is kfreed, so we * can continue to rely on unlocked pointer * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } } struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port) { struct ib_port_data *pdata; struct net_device *res; if (!rdma_is_port_valid(ib_dev, port)) return NULL; if (!ib_dev->port_data) return NULL; pdata = &ib_dev->port_data[port]; /* * New drivers should use ib_device_set_netdev() not the legacy * get_netdev(). */ if (ib_dev->ops.get_netdev) res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); dev_hold(res); spin_unlock(&pdata->netdev_lock); } return res; } EXPORT_SYMBOL(ib_device_get_netdev); /** * ib_query_netdev_port - Query the port number of a net_device * associated with an ibdev * @ibdev: IB device * @ndev: Network device * @port: IB port the net_device is connected to */ int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port) { struct net_device *ib_ndev; u32 port_num; rdma_for_each_port(ibdev, port_num) { ib_ndev = ib_device_get_netdev(ibdev, port_num); if (ndev == ib_ndev) { *port = port_num; dev_put(ib_ndev); return 0; } dev_put(ib_ndev); } return -ENOENT; } EXPORT_SYMBOL(ib_query_netdev_port); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device that is associated with a netdev via * ib_device_set_netdev(). The caller must call ib_device_put() on the * returned pointer. */ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id) { struct ib_device *res = NULL; struct ib_port_data *cur; rcu_read_lock(); hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; } } rcu_read_unlock(); return res; } EXPORT_SYMBOL(ib_device_get_by_netdev); /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero. */ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { u32 port; rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { struct net_device *idev = ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); dev_put(idev); } } /** * ib_enum_all_roce_netdevs - enumerate all RoCE devices * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all RoCE devices' physical ports which are related * to netdevices and calls callback() on each device for which * filter() function returns non zero. */ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&devices_rwsem); } /* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * * Enumerates all ib_devices and calls callback() on each device. */ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) continue; ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } up_read(&devices_rwsem); return ret; } /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (!device->ops.query_pkey) return -EOPNOTSUPP; return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->ops.modify_device) return -EOPNOTSUPP; return device->ops.modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (device->ops.modify_port) rc = device->ops.modify_port(device, port_num, port_modify_mask, port_modify); else if (rdma_protocol_roce(device, port_num) && ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) rc = 0; else rc = -EOPNOTSUPP; return rc; } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index) { union ib_gid tmp_gid; u32 port; int ret, i; rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) continue; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); /** * ib_get_net_dev_by_params() - Return the appropriate net_dev * for a received CM request * @dev: An RDMA device on which the request has been received. * @port: Port number on the RDMA device. * @pkey: The Pkey the request came on. * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { struct net_device *net_dev = NULL; unsigned long index; void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; /* * Holding the read side guarantees that the client will not become * unregistered while we are calling get_net_dev_by_params() */ down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->get_net_dev_by_params) continue; net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, addr, client_data); if (net_dev) break; } up_read(&dev->client_data_rwsem); return net_dev; } EXPORT_SYMBOL(ib_get_net_dev_by_params); void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) { struct ib_device_ops *dev_ops = &dev->ops; #define SET_DEVICE_OP(ptr, name) \ do { \ if (ops->name) \ if (!((ptr)->name)) \ (ptr)->name = ops->name; \ } while (0) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } if (ops->owner) { WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); dev_ops->owner = ops->owner; } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; dev_ops->uverbs_no_driver_id_binding |= ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); SET_DEVICE_OP(dev_ops, alloc_ucontext); SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_init); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); SET_DEVICE_OP(dev_ops, destroy_cq); SET_DEVICE_OP(dev_ops, destroy_flow); SET_DEVICE_OP(dev_ops, destroy_flow_action); SET_DEVICE_OP(dev_ops, destroy_qp); SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_srq_entry); SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); SET_DEVICE_OP(dev_ops, iw_create_listen); SET_DEVICE_OP(dev_ops, iw_destroy_listen); SET_DEVICE_OP(dev_ops, iw_get_qp); SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); SET_DEVICE_OP(dev_ops, process_mad); SET_DEVICE_OP(dev_ops, query_ah); SET_DEVICE_OP(dev_ops, query_device); SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); SET_DEVICE_OP(dev_ops, report_port_event); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); SET_OBJ_SIZE(dev_ops, rdma_counter); } EXPORT_SYMBOL(ib_set_device_ops); int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name) { struct ib_device *sub; int ret = 0; if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) return -EOPNOTSUPP; if (!ib_device_try_get(parent)) return -EINVAL; sub = parent->ops.add_sub_dev(parent, type, name); if (IS_ERR(sub)) { ib_device_put(parent); return PTR_ERR(sub); } sub->type = type; sub->parent = parent; mutex_lock(&parent->subdev_lock); list_add_tail(&parent->subdev_list_head, &sub->subdev_list); mutex_unlock(&parent->subdev_lock); return ret; } EXPORT_SYMBOL(ib_add_sub_device); int ib_del_sub_device_and_put(struct ib_device *sub) { struct ib_device *parent = sub->parent; if (!parent) return -EOPNOTSUPP; mutex_lock(&parent->subdev_lock); list_del(&sub->subdev_list); mutex_unlock(&parent->subdev_lock); ib_device_put(sub); parent->ops.del_sub_dev(sub); ib_device_put(parent); return 0; } EXPORT_SYMBOL(ib_del_sub_device_and_put); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { struct scatterlist *s; int i; for_each_sg(sg, s, nents, i) { sg_dma_address(s) = (uintptr_t)sg_virt(s); sg_dma_len(s) = s->length; } return nents; } EXPORT_SYMBOL(ib_dma_virt_map_sg); #endif /* CONFIG_INFINIBAND_VIRT_DMA */ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { .doit = ib_nl_handle_set_timeout, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_IP_RESOLVE] = { .doit = ib_nl_handle_ip_res_resp, .flags = RDMA_NL_ADMIN_PERM, }, }; void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) { enum ib_port_state curr_state; struct ib_event ibevent = {}; u32 port; if (ib_query_netdev_port(ibdev, ndev, &port)) return; curr_state = ib_get_curr_port_state(ndev); write_lock_irq(&ibdev->cache_lock); if (ibdev->port_data[port].cache.last_port_state == curr_state) { write_unlock_irq(&ibdev->cache_lock); return; } ibdev->port_data[port].cache.last_port_state = curr_state; write_unlock_irq(&ibdev->cache_lock); ibevent.event = (curr_state == IB_PORT_DOWN) ? IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; ibevent.device = ibdev; ibevent.element.port_num = port; ib_dispatch_event(&ibevent); } EXPORT_SYMBOL(ib_dispatch_port_state_event); static void handle_port_event(struct net_device *ndev, unsigned long event) { struct ib_device *ibdev; /* Currently, link events in bonding scenarios are still * reported by drivers that support bonding. */ if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) return; ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return; if (ibdev->ops.report_port_event) { ibdev->ops.report_port_event(ibdev, ndev, event); goto put_ibdev; } ib_dispatch_port_state_event(ibdev, ndev); put_ibdev: ib_device_put(ibdev); }; static int ib_netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct ib_device *ibdev; u32 port; switch (event) { case NETDEV_CHANGENAME: ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return NOTIFY_DONE; if (ib_query_netdev_port(ibdev, ndev, &port)) { ib_device_put(ibdev); break; } rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); ib_device_put(ibdev); break; case NETDEV_UP: case NETDEV_CHANGE: case NETDEV_DOWN: handle_port_event(ndev, event); break; default: break; } return NOTIFY_DONE; } static struct notifier_block nb_netdevice = { .notifier_call = ib_netdevice_event, }; static int __init ib_core_init(void) { int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!ib_unreg_wq) goto err; ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!ib_comp_wq) goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); if (!ib_comp_unbound_wq) goto err_comp; ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp_unbound; } rdma_nl_init(); ret = addr_init(); if (ret) { pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } ret = ib_mad_init(); if (ret) { pr_warn("Couldn't init IB MAD\n"); goto err_addr; } ret = ib_sa_init(); if (ret) { pr_warn("Couldn't init SA\n"); goto err_mad; } ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); goto err_sa; } ret = register_pernet_device(&rdma_dev_net_ops); if (ret) { pr_warn("Couldn't init compat dev. ret %d\n", ret); goto err_compat; } nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ret = roce_gid_mgmt_init(); if (ret) { pr_warn("Couldn't init RoCE GID management\n"); goto err_parent; } register_netdevice_notifier(&nb_netdevice); return 0; err_parent: rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); err_compat: unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: ib_mad_cleanup(); err_addr: addr_cleanup(); err_ibnl: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err_unbound: destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); /* ib core relies on netdev stack to first register net_ns_type_operations * ns kobject type before ib_core initialization. */ fs_initcall(ib_core_init); module_exit(ib_core_cleanup);
5 44 113 114 113 114 29 85 133 4 124 5 2 3 5 5 71 69 22 42 1 5 2 2 1 1 22 23 23 23 22 23 802 785 19 19 19 19 154 144 2 2 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 // SPDX-License-Identifier: GPL-2.0-or-later /* * lwtunnel Infrastructure for light weight tunnels like mpls * * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/lwtunnel.h> #include <linux/in.h> #include <linux/init.h> #include <linux/err.h> #include <net/lwtunnel.h> #include <net/rtnetlink.h> #include <net/ip6_fib.h> #include <net/rtnh.h> #include "dev.h" DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); #ifdef CONFIG_MODULES static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) { /* Only lwt encaps implemented without using an interface for * the encap need to return a string here. */ switch (encap_type) { case LWTUNNEL_ENCAP_MPLS: return "MPLS"; case LWTUNNEL_ENCAP_ILA: return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; case LWTUNNEL_ENCAP_BPF: return "BPF"; case LWTUNNEL_ENCAP_SEG6_LOCAL: return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; case LWTUNNEL_ENCAP_XFRM: /* module autoload not supported for encap type */ return NULL; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ WARN_ON(1); break; } return NULL; } #endif /* CONFIG_MODULES */ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); return lws; } EXPORT_SYMBOL_GPL(lwtunnel_state_alloc); static const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, unsigned int num) { if (num > LWTUNNEL_ENCAP_MAX) return -ERANGE; return !cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, unsigned int encap_type) { int ret; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) return -ERANGE; ret = (cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[encap_type], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; bool found = false; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG_ATTR(extack, encap, "Unknown LWT encapsulation type"); return ret; } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state && try_module_get(ops->owner))) found = true; rcu_read_unlock(); if (found) { ret = ops->build_state(net, encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } else { /* don't rely on -EOPNOTSUPP to detect match as build_state * handlers could return it */ NL_SET_ERR_MSG_ATTR(extack, encap, "LWT encapsulation type not supported"); } return ret; } EXPORT_SYMBOL_GPL(lwtunnel_build_state); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, bool rtnl_is_held) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); return ret; } ops = rcu_access_pointer(lwtun_encaps[encap_type]); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { if (rtnl_is_held) __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); if (rtnl_is_held) rtnl_lock(); ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } #endif ret = ops ? 0 : -EOPNOTSUPP; if (ret < 0) NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, struct netlink_ext_ack *extack, bool rtnl_is_held) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; struct nlattr *attrs; u16 encap_type; int attrlen; while (rtnh_ok(rtnh, remaining)) { attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { attrs = rtnh_attrs(rtnh); nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla_entype) { if (nla_len(nla_entype) < sizeof(u16)) { NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE"); return -EINVAL; } encap_type = nla_get_u16(nla_entype); if (lwtunnel_valid_encap_type(encap_type, extack, rtnl_is_held) != 0) return -EOPNOTSUPP; } } rtnh = rtnh_next(rtnh, &remaining); } return 0; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr); void lwtstate_free(struct lwtunnel_state *lws) { const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; if (ops->destroy_state) { ops->destroy_state(lws); kfree_rcu(lws, rcu); } else { kfree(lws); } module_put(ops->owner); } EXPORT_SYMBOL_GPL(lwtstate_free); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; int ret; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; nest = nla_nest_start_noflag(skb, encap_attr); if (!nest) return -EMSGSIZE; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->fill_encap)) ret = ops->fill_encap(skb, lwtstate); rcu_read_unlock(); if (ret) goto nla_put_failure; nla_nest_end(skb, nest); ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); if (ret) goto nla_put_failure; return 0; nla_put_failure: nla_nest_cancel(skb, nest); return (ret == -EOPNOTSUPP ? 0 : ret); } EXPORT_SYMBOL_GPL(lwtunnel_fill_encap); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->get_encap_size)) ret = nla_total_size(ops->get_encap_size(lwtstate)); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!a && !b) return 0; if (!a || !b) return 1; if (a->type != b->type) return 1; if (a->type == LWTUNNEL_ENCAP_NONE || a->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[a->type]); if (likely(ops && ops->cmp_encap)) ret = ops->cmp_encap(a, b); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; struct dst_entry *dst; int ret; if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); ret = -ENETDOWN; goto drop; } dst = skb_dst(skb); if (!dst) { ret = -EINVAL; goto drop; } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->output)) { dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); dev_xmit_recursion_dec(); } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; struct dst_entry *dst; int ret; if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); ret = -ENETDOWN; goto drop; } dst = skb_dst(skb); if (!dst) { ret = -EINVAL; goto drop; } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->xmit)) { dev_xmit_recursion_inc(); ret = ops->xmit(skb); dev_xmit_recursion_dec(); } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; struct dst_entry *dst; int ret; if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); ret = -ENETDOWN; goto drop; } dst = skb_dst(skb); if (!dst) { ret = -EINVAL; goto drop; } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->input)) { dev_xmit_recursion_inc(); ret = ops->input(skb); dev_xmit_recursion_dec(); } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_input);
4 15 2 1 1 9 2 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0-only /* * Xtables module to match the process control group. * * Might be used to implement individual "per-application" firewall * policies in contrast to global policies based on control groups. * Matching is based upon processes tagged to net_cls' classid marker. * * (C) 2013 Daniel Borkmann <dborkman@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/skbuff.h> #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_cgroup.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v0 *info = par->matchinfo; if (info->invert & ~1) return -EINVAL; return 0; } static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v1 *info = par->matchinfo; struct cgroup *cgrp; if ((info->invert_path & ~1) || (info->invert_classid & ~1)) return -EINVAL; if (!info->has_path && !info->has_classid) { pr_info("xt_cgroup: no path or classid specified\n"); return -EINVAL; } if (info->has_path && info->has_classid) { pr_info_ratelimited("path and classid specified\n"); return -EINVAL; } info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); if (IS_ERR(cgrp)) { pr_info_ratelimited("invalid path, errno=%ld\n", PTR_ERR(cgrp)); return -EINVAL; } info->priv = cgrp; } return 0; } static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v2 *info = par->matchinfo; struct cgroup *cgrp; if ((info->invert_path & ~1) || (info->invert_classid & ~1)) return -EINVAL; if (!info->has_path && !info->has_classid) { pr_info("xt_cgroup: no path or classid specified\n"); return -EINVAL; } if (info->has_path && info->has_classid) { pr_info_ratelimited("path and classid specified\n"); return -EINVAL; } info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); if (IS_ERR(cgrp)) { pr_info_ratelimited("invalid path, errno=%ld\n", PTR_ERR(cgrp)); return -EINVAL; } info->priv = cgrp; } return 0; } static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v0 *info = par->matchinfo; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ info->invert; } static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v1 *info = par->matchinfo; struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; struct cgroup *ancestor = info->priv; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; } static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v2 *info = par->matchinfo; struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; struct cgroup *ancestor = info->priv; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; } static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) { struct xt_cgroup_info_v1 *info = par->matchinfo; if (info->priv) cgroup_put(info->priv); } static void cgroup_mt_destroy_v2(const struct xt_mtdtor_param *par) { struct xt_cgroup_info_v2 *info = par->matchinfo; if (info->priv) cgroup_put(info->priv); } static struct xt_match cgroup_mt_reg[] __read_mostly = { { .name = "cgroup", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = cgroup_mt_check_v0, .match = cgroup_mt_v0, .matchsize = sizeof(struct xt_cgroup_info_v0), .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, { .name = "cgroup", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = cgroup_mt_check_v1, .match = cgroup_mt_v1, .matchsize = sizeof(struct xt_cgroup_info_v1), .usersize = offsetof(struct xt_cgroup_info_v1, priv), .destroy = cgroup_mt_destroy_v1, .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, { .name = "cgroup", .revision = 2, .family = NFPROTO_UNSPEC, .checkentry = cgroup_mt_check_v2, .match = cgroup_mt_v2, .matchsize = sizeof(struct xt_cgroup_info_v2), .usersize = offsetof(struct xt_cgroup_info_v2, priv), .destroy = cgroup_mt_destroy_v2, .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, }; static int __init cgroup_mt_init(void) { return xt_register_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg)); } static void __exit cgroup_mt_exit(void) { xt_unregister_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg)); } module_init(cgroup_mt_init); module_exit(cgroup_mt_exit);
5 5 6 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/export.h> #include "rds.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); EXPORT_PER_CPU_SYMBOL_GPL(rds_stats); /* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */ static const char *const rds_stat_names[] = { "conn_reset", "recv_drop_bad_checksum", "recv_drop_old_seq", "recv_drop_no_sock", "recv_drop_dead_sock", "recv_deliver_raced", "recv_delivered", "recv_queued", "recv_immediate_retry", "recv_delayed_retry", "recv_ack_required", "recv_rdma_bytes", "recv_ping", "send_queue_empty", "send_queue_full", "send_lock_contention", "send_lock_queue_raced", "send_immediate_retry", "send_delayed_retry", "send_drop_acked", "send_ack_required", "send_queued", "send_rdma", "send_rdma_bytes", "send_pong", "page_remainder_hit", "page_remainder_miss", "copy_to_user", "copy_from_user", "cong_update_queued", "cong_update_received", "cong_send_error", "cong_send_blocked", "recv_bytes_added_to_sock", "recv_bytes_freed_fromsock", "send_stuck_rm", }; void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr) { struct rds_info_counter ctr; size_t i; for (i = 0; i < nr; i++) { BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); strscpy_pad(ctr.name, names[i]); ctr.value = values[i]; rds_info_copy(iter, &ctr, sizeof(ctr)); } } EXPORT_SYMBOL_GPL(rds_stats_info_copy); /* * This gives global counters across all the transports. The strings * are copied in so that the tool doesn't need knowledge of the specific * stats that we're exporting. Some are pretty implementation dependent * and may change over time. That doesn't stop them from being useful. * * This is the only function in the chain that knows about the byte granular * length in userspace. It converts it to number of stat entries that the * rest of the functions operate in. */ static void rds_stats_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_statistics stats = {0, }; uint64_t *src; uint64_t *sum; size_t i; int cpu; unsigned int avail; avail = len / sizeof(struct rds_info_counter); if (avail < ARRAY_SIZE(rds_stat_names)) { avail = 0; goto trans; } for_each_online_cpu(cpu) { src = (uint64_t *)&(per_cpu(rds_stats, cpu)); sum = (uint64_t *)&stats; for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) *(sum++) += *(src++); } rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, ARRAY_SIZE(rds_stat_names)); avail -= ARRAY_SIZE(rds_stat_names); trans: lens->each = sizeof(struct rds_info_counter); lens->nr = rds_trans_stats_info_copy(iter, avail) + ARRAY_SIZE(rds_stat_names); } void rds_stats_exit(void) { rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); } int rds_stats_init(void) { rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); return 0; }
3 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_TABLES_IPV6_H_ #define _NF_TABLES_IPV6_H_ #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/ipv6.h> #include <net/netfilter/nf_tables.h> static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) { unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) { nft_set_pktinfo_unspec(pkt); return; } pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; } static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; u32 pkt_len, skb_len; int protohdr; ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*ip6h), &_ip6h); if (!ip6h) return -1; if (ip6h->version != 6) return -1; pkt_len = ntohs(ip6h->payload_len); skb_len = pkt->skb->len - skb_network_offset(pkt->skb); if (pkt_len + sizeof(*ip6h) > skb_len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; return 0; #else return -1; #endif } static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { if (__nft_set_pktinfo_ipv6_validate(pkt) < 0) nft_set_pktinfo_unspec(pkt); } static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; unsigned short frag_off; unsigned int thoff = 0; struct inet6_dev *idev; struct ipv6hdr *ip6h; int protohdr; u32 pkt_len; if (!pskb_may_pull(pkt->skb, sizeof(*ip6h))) return -1; ip6h = ipv6_hdr(pkt->skb); if (ip6h->version != 6) goto inhdr_error; pkt_len = ntohs(ip6h->payload_len); if (pkt_len + sizeof(*ip6h) > pkt->skb->len) { idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) goto inhdr_error; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; return 0; inhdr_error: idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INHDRERRORS); return -1; #else return -1; #endif } #endif
2176 2171 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2013-2014 Intel Corp. */ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/module.h> #include <linux/debugfs.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/netdev_lock.h> #include <net/pkt_sched.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/6lowpan.h> /* for the compression support */ #define VERSION "0.1" static struct dentry *lowpan_enable_debugfs; static struct dentry *lowpan_control_debugfs; #define IFACE_NAME_TEMPLATE "bt%d" struct skb_cb { struct in6_addr addr; struct in6_addr gw; struct l2cap_chan *chan; }; #define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb)) /* The devices list contains those devices that we are acting * as a proxy. The BT 6LoWPAN device is a virtual device that * connects to the Bluetooth LE device. The real connection to * BT device is done via l2cap layer. There exists one * virtual device / one BT 6LoWPAN network (=hciX device). * The list contains struct lowpan_dev elements. */ static LIST_HEAD(bt_6lowpan_devices); static DEFINE_SPINLOCK(devices_lock); static bool enable_6lowpan; /* We are listening incoming connections via this channel */ static struct l2cap_chan *listen_chan; static DEFINE_MUTEX(set_lock); struct lowpan_peer { struct list_head list; struct rcu_head rcu; struct l2cap_chan *chan; /* peer addresses in various formats */ unsigned char lladdr[ETH_ALEN]; struct in6_addr peer_addr; }; struct lowpan_btle_dev { struct list_head list; struct hci_dev *hdev; struct net_device *netdev; struct list_head peers; atomic_t peer_count; /* number of items in peers list */ struct work_struct delete_netdev; struct delayed_work notify_peers; }; static inline struct lowpan_btle_dev * lowpan_btle_dev(const struct net_device *netdev) { return (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv; } static inline void peer_add(struct lowpan_btle_dev *dev, struct lowpan_peer *peer) { list_add_rcu(&peer->list, &dev->peers); atomic_inc(&dev->peer_count); } static inline bool peer_del(struct lowpan_btle_dev *dev, struct lowpan_peer *peer) { list_del_rcu(&peer->list); kfree_rcu(peer, rcu); module_put(THIS_MODULE); if (atomic_dec_and_test(&dev->peer_count)) { BT_DBG("last peer"); return true; } return false; } static inline struct lowpan_peer * __peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan) { struct lowpan_peer *peer; list_for_each_entry_rcu(peer, &dev->peers, list) { if (peer->chan == chan) return peer; } return NULL; } static inline struct lowpan_peer * __peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn) { struct lowpan_peer *peer; list_for_each_entry_rcu(peer, &dev->peers, list) { if (peer->chan->conn == conn) return peer; } return NULL; } static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, struct in6_addr *daddr, struct sk_buff *skb) { struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int count = atomic_read(&dev->peer_count); const struct in6_addr *nexthop; struct lowpan_peer *peer; struct neighbour *neigh; BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt); if (!rt) { if (ipv6_addr_any(&lowpan_cb(skb)->gw)) { /* There is neither route nor gateway, * probably the destination is a direct peer. */ nexthop = daddr; } else { /* There is a known gateway */ nexthop = &lowpan_cb(skb)->gw; } } else { nexthop = rt6_nexthop(rt, daddr); /* We need to remember the address because it is needed * by bt_xmit() when sending the packet. In bt_xmit(), the * destination routing info is not set. */ memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr)); } BT_DBG("gw %pI6c", nexthop); rcu_read_lock(); list_for_each_entry_rcu(peer, &dev->peers, list) { BT_DBG("dst addr %pMR dst type %u ip %pI6c", &peer->chan->dst, peer->chan->dst_type, &peer->peer_addr); if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) { rcu_read_unlock(); return peer; } } /* use the neighbour cache for matching addresses assigned by SLAAC */ neigh = __ipv6_neigh_lookup(dev->netdev, nexthop); if (neigh) { list_for_each_entry_rcu(peer, &dev->peers, list) { if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) { neigh_release(neigh); rcu_read_unlock(); return peer; } } neigh_release(neigh); } rcu_read_unlock(); return NULL; } static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer = NULL; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { peer = __peer_lookup_conn(entry, conn); if (peer) break; } rcu_read_unlock(); return peer; } static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn) { struct lowpan_btle_dev *entry; struct lowpan_btle_dev *dev = NULL; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { if (conn->hcon->hdev == entry->hdev) { dev = entry; break; } } rcu_read_unlock(); return dev; } static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *skb_cp; skb_cp = skb_copy(skb, GFP_ATOMIC); if (!skb_cp) return NET_RX_DROP; return netif_rx(skb_cp); } static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, struct lowpan_peer *peer) { const u8 *saddr; saddr = peer->lladdr; return lowpan_header_decompress(skb, netdev, netdev->dev_addr, saddr); } static int recv_pkt(struct sk_buff *skb, struct net_device *dev, struct lowpan_peer *peer) { struct sk_buff *local_skb; int ret; if (!netif_running(dev)) goto drop; if (dev->type != ARPHRD_6LOWPAN || !skb->len) goto drop; skb_reset_network_header(skb); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto drop; /* check that it's our buffer */ if (lowpan_is_ipv6(*skb_network_header(skb))) { /* Pull off the 1-byte of 6lowpan header. */ skb_pull(skb, 1); /* Copy the packet so that the IPv6 header is * properly aligned. */ local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1, skb_tailroom(skb), GFP_ATOMIC); if (!local_skb) goto drop; local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; local_skb->dev = dev; skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { kfree_skb(local_skb); goto drop; } dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; consume_skb(local_skb); consume_skb(skb); } else if (lowpan_is_iphc(*skb_network_header(skb))) { local_skb = skb_clone(skb, GFP_ATOMIC); if (!local_skb) goto drop; local_skb->dev = dev; ret = iphc_decompress(local_skb, dev, peer); if (ret < 0) { BT_DBG("iphc_decompress failed: %d", ret); kfree_skb(local_skb); goto drop; } local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { kfree_skb(local_skb); goto drop; } dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; consume_skb(local_skb); consume_skb(skb); } else { BT_DBG("unknown packet type"); goto drop; } return NET_RX_SUCCESS; drop: dev->stats.rx_dropped++; return NET_RX_DROP; } /* Packet from BT LE device */ static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { struct lowpan_btle_dev *dev; struct lowpan_peer *peer; int err; peer = lookup_peer(chan->conn); if (!peer) return -ENOENT; dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return -ENOENT; err = recv_pkt(skb, dev->netdev, peer); if (err) { BT_DBG("recv pkt %d", err); err = -EAGAIN; } return err; } static int setup_header(struct sk_buff *skb, struct net_device *netdev, bdaddr_t *peer_addr, u8 *peer_addr_type) { struct in6_addr ipv6_daddr; struct ipv6hdr *hdr; struct lowpan_btle_dev *dev; struct lowpan_peer *peer; u8 *daddr; int err, status = 0; hdr = ipv6_hdr(skb); dev = lowpan_btle_dev(netdev); memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr)); if (ipv6_addr_is_multicast(&ipv6_daddr)) { lowpan_cb(skb)->chan = NULL; daddr = NULL; } else { BT_DBG("dest IP %pI6c", &ipv6_daddr); /* The packet might be sent to 6lowpan interface * because of routing (either via default route * or user set route) so get peer according to * the destination address. */ peer = peer_lookup_dst(dev, &ipv6_daddr, skb); if (!peer) { BT_DBG("no such peer"); return -ENOENT; } daddr = peer->lladdr; *peer_addr = peer->chan->dst; *peer_addr_type = peer->chan->dst_type; lowpan_cb(skb)->chan = peer->chan; status = 1; } lowpan_header_compress(skb, netdev, daddr, dev->netdev->dev_addr); err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0); if (err < 0) return err; return status; } static int header_create(struct sk_buff *skb, struct net_device *netdev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { if (type != ETH_P_IPV6) return -EINVAL; return 0; } /* Packet to BT LE device */ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, struct net_device *netdev) { struct msghdr msg; struct kvec iv; int err; /* Remember the skb so that we can send EAGAIN to the caller if * we run out of credits. */ chan->data = skb; iv.iov_base = skb->data; iv.iov_len = skb->len; memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len); err = l2cap_chan_send(chan, &msg, skb->len, NULL); if (err > 0) { netdev->stats.tx_bytes += err; netdev->stats.tx_packets++; return 0; } if (err < 0) netdev->stats.tx_errors++; return err; } static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) { struct sk_buff *local_skb; struct lowpan_btle_dev *entry; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { struct lowpan_peer *pentry; struct lowpan_btle_dev *dev; if (entry->netdev != netdev) continue; dev = lowpan_btle_dev(entry->netdev); list_for_each_entry_rcu(pentry, &dev->peers, list) { int ret; local_skb = skb_clone(skb, GFP_ATOMIC); BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &pentry->chan->dst, pentry->chan->dst_type, &pentry->peer_addr, pentry->chan); ret = send_pkt(pentry->chan, local_skb, netdev); if (ret < 0) err = ret; kfree_skb(local_skb); } } rcu_read_unlock(); return err; } static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) { int err = 0; bdaddr_t addr; u8 addr_type; /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; /* Return values from setup_header() * <0 - error, packet is dropped * 0 - this is a multicast packet * 1 - this is unicast packet */ err = setup_header(skb, netdev, &addr, &addr_type); if (err < 0) { kfree_skb(skb); return NET_XMIT_DROP; } if (err) { if (lowpan_cb(skb)->chan) { BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &addr, addr_type, &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); err = send_pkt(lowpan_cb(skb)->chan, skb, netdev); } else { err = -ENOENT; } } else { /* We need to send the packet to every device behind this * interface. */ err = send_mcast_pkt(skb, netdev); } dev_kfree_skb(skb); if (err) BT_DBG("ERROR: xmit failed (%d)", err); return err < 0 ? NET_XMIT_DROP : err; } static int bt_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return 0; } static const struct net_device_ops netdev_ops = { .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; static const struct header_ops header_ops = { .create = header_create, }; static void netdev_setup(struct net_device *dev) { dev->hard_header_len = 0; dev->needed_tailroom = 0; dev->flags = IFF_RUNNING | IFF_MULTICAST; dev->watchdog_timeo = 0; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->netdev_ops = &netdev_ops; dev->header_ops = &header_ops; dev->needs_free_netdev = true; } static const struct device_type bt_type = { .name = "bluetooth", }; static void ifup(struct net_device *netdev) { int err; rtnl_lock(); err = dev_open(netdev, NULL); if (err < 0) BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); rtnl_unlock(); } static void ifdown(struct net_device *netdev) { rtnl_lock(); dev_close(netdev); rtnl_unlock(); } static void do_notify_peers(struct work_struct *work) { struct lowpan_btle_dev *dev = container_of(work, struct lowpan_btle_dev, notify_peers.work); netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */ } static bool is_bt_6lowpan(struct hci_conn *hcon) { if (hcon->type != LE_LINK) return false; if (!enable_6lowpan) return false; return true; } static struct l2cap_chan *chan_create(void) { struct l2cap_chan *chan; chan = l2cap_chan_create(); if (!chan) return NULL; l2cap_chan_set_defaults(chan); chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; chan->mode = L2CAP_MODE_LE_FLOWCTL; chan->imtu = 1280; return chan; } static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, struct lowpan_btle_dev *dev, bool new_netdev) { struct lowpan_peer *peer; peer = kzalloc(sizeof(*peer), GFP_ATOMIC); if (!peer) return NULL; peer->chan = chan; baswap((void *)peer->lladdr, &chan->dst); lowpan_iphc_uncompress_eui48_lladdr(&peer->peer_addr, peer->lladdr); spin_lock(&devices_lock); INIT_LIST_HEAD(&peer->list); peer_add(dev, peer); spin_unlock(&devices_lock); /* Notifying peers about us needs to be done without locks held */ if (new_netdev) INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100)); return peer->chan; } static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; bdaddr_t addr; int err; netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, netdev_setup); if (!netdev) return -ENOMEM; netdev->addr_assign_type = NET_ADDR_PERM; baswap(&addr, &chan->src); __dev_addr_set(netdev, &addr, sizeof(addr)); netdev->netdev_ops = &netdev_ops; SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); *dev = lowpan_btle_dev(netdev); (*dev)->netdev = netdev; (*dev)->hdev = chan->conn->hcon->hdev; INIT_LIST_HEAD(&(*dev)->peers); spin_lock(&devices_lock); INIT_LIST_HEAD(&(*dev)->list); list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); spin_unlock(&devices_lock); err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE); if (err < 0) { BT_INFO("register_netdev failed %d", err); spin_lock(&devices_lock); list_del_rcu(&(*dev)->list); spin_unlock(&devices_lock); free_netdev(netdev); goto out; } BT_DBG("ifindex %d peer bdaddr %pMR type %d my addr %pMR type %d", netdev->ifindex, &chan->dst, chan->dst_type, &chan->src, chan->src_type); set_bit(__LINK_STATE_PRESENT, &netdev->state); return 0; out: return err; } static inline void chan_ready_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; bool new_netdev = false; dev = lookup_dev(chan->conn); BT_DBG("chan %p conn %p dev %p", chan, chan->conn, dev); if (!dev) { if (setup_netdev(chan, &dev) < 0) { l2cap_chan_del(chan, -ENOENT); return; } new_netdev = true; } if (!try_module_get(THIS_MODULE)) return; add_peer_chan(chan, dev, new_netdev); ifup(dev->netdev); } static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; chan = chan_create(); if (!chan) return NULL; chan->ops = pchan->ops; BT_DBG("chan %p pchan %p", chan, pchan); return chan; } static void delete_netdev(struct work_struct *work) { struct lowpan_btle_dev *entry = container_of(work, struct lowpan_btle_dev, delete_netdev); lowpan_unregister_netdev(entry->netdev); /* The entry pointer is deleted by the netdev destructor. */ } static void chan_close_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *entry; struct lowpan_btle_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; bool last = false, remove = true; BT_DBG("chan %p conn %p", chan, chan->conn); if (chan->conn && chan->conn->hcon) { if (!is_bt_6lowpan(chan->conn->hcon)) return; /* If conn is set, then the netdev is also there and we should * not remove it. */ remove = false; } spin_lock(&devices_lock); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { dev = lowpan_btle_dev(entry->netdev); peer = __peer_lookup_chan(dev, chan); if (peer) { last = peer_del(dev, peer); err = 0; BT_DBG("dev %p removing %speer %p", dev, last ? "last " : "1 ", peer); BT_DBG("chan %p orig refcnt %u", chan, kref_read(&chan->kref)); l2cap_chan_put(chan); break; } } if (!err && last && dev && !atomic_read(&dev->peer_count)) { spin_unlock(&devices_lock); cancel_delayed_work_sync(&dev->notify_peers); ifdown(dev->netdev); if (remove) { INIT_WORK(&entry->delete_netdev, delete_netdev); schedule_work(&entry->delete_netdev); } } else { spin_unlock(&devices_lock); } } static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err) { BT_DBG("chan %p conn %p state %s err %d", chan, chan->conn, state_to_string(state), err); } static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { struct sk_buff *skb; /* Note that we must allocate using GFP_ATOMIC here as * this function is called originally from netdev hard xmit * function in atomic context. */ skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); return skb; } static void chan_suspend_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; BT_DBG("chan %p suspend", chan); dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return; netif_stop_queue(dev->netdev); } static void chan_resume_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; BT_DBG("chan %p resume", chan); dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return; netif_wake_queue(dev->netdev); } static long chan_get_sndtimeo_cb(struct l2cap_chan *chan) { return L2CAP_CONN_TIMEOUT; } static const struct l2cap_ops bt_6lowpan_chan_ops = { .name = "L2CAP 6LoWPAN channel", .new_connection = chan_new_conn_cb, .recv = chan_recv_cb, .close = chan_close_cb, .state_change = chan_state_change_cb, .ready = chan_ready_cb, .resume = chan_resume_cb, .suspend = chan_suspend_cb, .get_sndtimeo = chan_get_sndtimeo_cb, .alloc_skb = chan_alloc_skb_cb, .teardown = l2cap_chan_no_teardown, .defer = l2cap_chan_no_defer, .set_shutdown = l2cap_chan_no_set_shutdown, }; static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { struct l2cap_chan *chan; int err; chan = chan_create(); if (!chan) return -EINVAL; chan->ops = &bt_6lowpan_chan_ops; err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, addr, dst_type, L2CAP_CONN_TIMEOUT); BT_DBG("chan %p err %d", chan, err); if (err < 0) l2cap_chan_put(chan); return err; } static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) { struct lowpan_peer *peer; BT_DBG("conn %p dst type %u", conn, dst_type); peer = lookup_peer(conn); if (!peer) return -ENOENT; BT_DBG("peer %p chan %p", peer, peer->chan); l2cap_chan_close(peer->chan, ENOENT); return 0; } static struct l2cap_chan *bt_6lowpan_listen(void) { bdaddr_t *addr = BDADDR_ANY; struct l2cap_chan *chan; int err; if (!enable_6lowpan) return NULL; chan = chan_create(); if (!chan) return NULL; chan->ops = &bt_6lowpan_chan_ops; chan->state = BT_LISTEN; chan->src_type = BDADDR_LE_PUBLIC; atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); BT_DBG("chan %p src type %u", chan, chan->src_type); err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if (err) { l2cap_chan_put(chan); BT_ERR("psm cannot be added err %d", err); return NULL; } return chan; } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, struct l2cap_conn **conn) { struct hci_conn *hcon; struct hci_dev *hdev; int n; n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", &addr->b[5], &addr->b[4], &addr->b[3], &addr->b[2], &addr->b[1], &addr->b[0], addr_type); if (n < 7) return -EINVAL; /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC); if (!hdev) return -ENOENT; hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); hci_dev_unlock(hdev); hci_dev_put(hdev); if (!hcon) return -ENOENT; *conn = (struct l2cap_conn *)hcon->l2cap_data; BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type); return 0; } static void disconnect_all_peers(void) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer, *tmp_peer, *new_peer; struct list_head peers; INIT_LIST_HEAD(&peers); /* We make a separate list of peers as the close_cb() will * modify the device peers list so it is better not to mess * with the same list at the same time. */ rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { list_for_each_entry_rcu(peer, &entry->peers, list) { new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC); if (!new_peer) break; new_peer->chan = peer->chan; INIT_LIST_HEAD(&new_peer->list); list_add(&new_peer->list, &peers); } } rcu_read_unlock(); spin_lock(&devices_lock); list_for_each_entry_safe(peer, tmp_peer, &peers, list) { l2cap_chan_close(peer->chan, ENOENT); list_del_rcu(&peer->list); kfree_rcu(peer, rcu); } spin_unlock(&devices_lock); } struct set_enable { struct work_struct work; bool flag; }; static void do_enable_set(struct work_struct *work) { struct set_enable *set_enable = container_of(work, struct set_enable, work); if (!set_enable->flag || enable_6lowpan != set_enable->flag) /* Disconnect existing connections if 6lowpan is * disabled */ disconnect_all_peers(); enable_6lowpan = set_enable->flag; mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } listen_chan = bt_6lowpan_listen(); mutex_unlock(&set_lock); kfree(set_enable); } static int lowpan_enable_set(void *data, u64 val) { struct set_enable *set_enable; set_enable = kzalloc(sizeof(*set_enable), GFP_KERNEL); if (!set_enable) return -ENOMEM; set_enable->flag = !!val; INIT_WORK(&set_enable->work, do_enable_set); schedule_work(&set_enable->work); return 0; } static int lowpan_enable_get(void *data, u64 *val) { *val = enable_6lowpan; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get, lowpan_enable_set, "%llu\n"); static ssize_t lowpan_control_write(struct file *fp, const char __user *user_buffer, size_t count, loff_t *position) { char buf[32]; size_t buf_size = min(count, sizeof(buf) - 1); int ret; bdaddr_t addr; u8 addr_type; struct l2cap_conn *conn = NULL; if (copy_from_user(buf, user_buffer, buf_size)) return -EFAULT; buf[buf_size] = '\0'; if (memcmp(buf, "connect ", 8) == 0) { ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn); if (ret == -EINVAL) return ret; mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); listen_chan = NULL; } mutex_unlock(&set_lock); if (conn) { struct lowpan_peer *peer; if (!is_bt_6lowpan(conn->hcon)) return -EINVAL; peer = lookup_peer(conn); if (peer) { BT_DBG("6LoWPAN connection already exists"); return -EALREADY; } BT_DBG("conn %p dst %pMR type %d user %u", conn, &conn->hcon->dst, conn->hcon->dst_type, addr_type); } ret = bt_6lowpan_connect(&addr, addr_type); if (ret < 0) return ret; return count; } if (memcmp(buf, "disconnect ", 11) == 0) { ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn); if (ret < 0) return ret; ret = bt_6lowpan_disconnect(conn, addr_type); if (ret < 0) return ret; return count; } return count; } static int lowpan_control_show(struct seq_file *f, void *ptr) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer; spin_lock(&devices_lock); list_for_each_entry(entry, &bt_6lowpan_devices, list) { list_for_each_entry(peer, &entry->peers, list) seq_printf(f, "%pMR (type %u)\n", &peer->chan->dst, peer->chan->dst_type); } spin_unlock(&devices_lock); return 0; } static int lowpan_control_open(struct inode *inode, struct file *file) { return single_open(file, lowpan_control_show, inode->i_private); } static const struct file_operations lowpan_control_fops = { .open = lowpan_control_open, .read = seq_read, .write = lowpan_control_write, .llseek = seq_lseek, .release = single_release, }; static void disconnect_devices(void) { struct lowpan_btle_dev *entry, *tmp, *new_dev; struct list_head devices; INIT_LIST_HEAD(&devices); /* We make a separate list of devices because the unregister_netdev() * will call device_event() which will also want to modify the same * devices list. */ rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { new_dev = kmalloc(sizeof(*new_dev), GFP_ATOMIC); if (!new_dev) break; new_dev->netdev = entry->netdev; INIT_LIST_HEAD(&new_dev->list); list_add_rcu(&new_dev->list, &devices); } rcu_read_unlock(); list_for_each_entry_safe(entry, tmp, &devices, list) { ifdown(entry->netdev); BT_DBG("Unregistering netdev %s %p", entry->netdev->name, entry->netdev); lowpan_unregister_netdev(entry->netdev); kfree(entry); } } static int device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct lowpan_btle_dev *entry; if (netdev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: spin_lock(&devices_lock); list_for_each_entry(entry, &bt_6lowpan_devices, list) { if (entry->netdev == netdev) { BT_DBG("Unregistered netdev %s %p", netdev->name, netdev); list_del(&entry->list); break; } } spin_unlock(&devices_lock); break; } return NOTIFY_DONE; } static struct notifier_block bt_6lowpan_dev_notifier = { .notifier_call = device_event, }; static int __init bt_6lowpan_init(void) { lowpan_enable_debugfs = debugfs_create_file_unsafe("6lowpan_enable", 0644, bt_debugfs, NULL, &lowpan_enable_fops); lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644, bt_debugfs, NULL, &lowpan_control_fops); return register_netdevice_notifier(&bt_6lowpan_dev_notifier); } static void __exit bt_6lowpan_exit(void) { debugfs_remove(lowpan_enable_debugfs); debugfs_remove(lowpan_control_debugfs); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } disconnect_devices(); unregister_netdevice_notifier(&bt_6lowpan_dev_notifier); } module_init(bt_6lowpan_init); module_exit(bt_6lowpan_exit); MODULE_AUTHOR("Jukka Rissanen <jukka.rissanen@linux.intel.com>"); MODULE_DESCRIPTION("Bluetooth 6LoWPAN"); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL");
1586 1586 1589 230 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/srcu.h> #include <linux/interval_tree.h> struct mmu_notifier_subscriptions; struct mmu_notifier; struct mmu_notifier_range; struct mmu_interval_notifier; /** * enum mmu_notifier_event - reason for the mmu notifier callback * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that * move the range * * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like * madvise() or replacing a page by another one, ...). * * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range * ie using the vma access permission (vm_page_prot) to update the whole range * is enough no need to inspect changes to the CPU page table (mprotect() * syscall) * * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for * pages in the range so to mirror those changes the user must inspect the CPU * page table (from the end callback). * * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same * access flags). User should soft dirty the page in the end callback to make * sure that anyone relying on soft dirtiness catch pages that might be written * through non CPU mappings. * * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal * that the mm refcount is zero and the range is no longer accessible. * * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive. * The owner is initialized to the value provided by the caller of * make_device_exclusive(), such that this caller can filter out these * events. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, MMU_NOTIFY_CLEAR, MMU_NOTIFY_PROTECTION_VMA, MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_RELEASE, MMU_NOTIFY_MIGRATE, MMU_NOTIFY_EXCLUSIVE, }; #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_ops { /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the * secondary mmu. If this method isn't implemented you've to * be sure that nothing could possibly write to the pages * through the secondary mmu by the time the last thread with * tsk->mm == mm exits. * * As side note: the pages freed after ->release returns could * be immediately reallocated by the gart at an alias physical * address with a different cache model, so if ->release isn't * implemented because all _software_ driven memory accesses * through the secondary mmu are terminated by the time the * last thread of this mm quits, you've also to be sure that * speculative _hardware_ operations can't allocate dirty * cachelines in the cpu that could not be snooped and made * coherent with the other read and write operations happening * through the gart alias address, so leading to memory * corruption. */ void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm); /* * clear_flush_young is called after the VM is * test-and-clearing the young/accessed bitflag in the * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ int (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. */ int (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * test_young is called to check the young/accessed bitflag in * the secondary pte. This is used to know if the page is * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ int (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address); /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_lock and/or the * locks protecting the reverse maps are held. If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). * * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to * invalidate_range_begin/end for the whole duration of the * invalidate_range_begin/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. * * invalidate_range_end() is called when all pages in the * range have been unmapped and the pages have been freed by * the VM. * * The VM will remove the page table entries and potentially * the page between invalidate_range_start() and * invalidate_range_end(). If the page must not be freed * because of pending I/O or other circumstances then the * invalidate_range_start() callback (or the initial mapping * by the driver) must make sure that the refcount is kept * elevated. * * If the driver increases the refcount when the pages are * initially mapped into an address space then either * invalidate_range_start() or invalidate_range_end() may * decrease the refcount. If the refcount is decreased on * invalidate_range_start() then the VM can free pages as page * table entries are removed. If the refcount is only * dropped on invalidate_range_end() then the driver itself * will drop the last refcount but it must take care to flush * any secondary tlb before doing the final free on the * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN if sleeping would be required. * 0 should be returned otherwise. Please note that notifiers that can * fail invalidate_range_start are not allowed to implement * invalidate_range_end, as there is no mechanism for informing the * notifier that its start failed. */ int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); /* * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB * which shares page-tables with the CPU. The * invalidate_range_start()/end() callbacks should not be implemented as * invalidate_secondary_tlbs() already catches the points in time when * an external TLB needs to be flushed. * * This requires arch_invalidate_secondary_tlbs() to be called while * holding the ptl spin-lock and therefore this callback is not allowed * to sleep. * * This is called by architecture code whenever invalidating a TLB * entry. It is assumed that any secondary TLB has the same rules for * when invalidations are required. If this is not the case architecture * code will need to call this explicitly when required for secondary * TLB invalidation. */ void (*arch_invalidate_secondary_tlbs)( struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * These callbacks are used with the get/put interface to manage the * lifetime of the mmu_notifier memory. alloc_notifier() returns a new * notifier for use with the mm. * * free_notifier() is only called after the mmu_notifier has been * fully put, calls to any ops callback are prevented and no ops * callbacks are currently running. It is called from a SRCU callback * and cannot sleep. */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); void (*free_notifier)(struct mmu_notifier *subscription); }; /* * The notifier chains are protected by mmap_lock and/or the reverse map * semaphores. Notifier chains are only changed when all reverse maps and * the mmap_lock locks are taken. * * Therefore notifier chains can only be traversed when either * * 1. mmap_lock is held. * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; struct mm_struct *mm; struct rcu_head rcu; unsigned int users; }; /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. */ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); }; struct mmu_interval_notifier { struct interval_tree_node interval_tree; const struct mmu_interval_notifier_ops *ops; struct mm_struct *mm; struct hlist_node deferred_item; unsigned long invalidate_seq; }; #ifdef CONFIG_MMU_NOTIFIER #ifdef CONFIG_LOCKDEP extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; enum mmu_notifier_event event; void *owner; }; static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->notifier_subscriptions); } struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm); static inline struct mmu_notifier * mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { struct mmu_notifier *ret; mmap_write_lock(mm); ret = mmu_notifier_get_locked(ops, mm); mmap_write_unlock(mm); return ret; } void mmu_notifier_put(struct mmu_notifier *subscription); void mmu_notifier_synchronize(void); extern int mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *subscription, struct mm_struct *mm); unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub); int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); int mmu_interval_notifier_insert_locked( struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence * @interval_sub - The subscription passed to invalidate * @cur_seq - The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call * mmu_interval_read_retry(). It updates the sequence number for later use by * mmu_interval_read_retry(). The provided cur_seq will always be odd. * * If the caller does not call mmu_interval_read_begin() or * mmu_interval_read_retry() then this call is not required. */ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, unsigned long cur_seq) { WRITE_ONCE(interval_sub->invalidate_seq, cur_seq); } /** * mmu_interval_read_retry - End a read side critical section against a VA range * interval_sub: The subscription * seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). * * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * * Returns true if an invalidation collided with this critical section, and * the caller should retry. */ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { return interval_sub->invalidate_seq != seq; } /** * mmu_interval_check_retry - Test if a collision has occurred * interval_sub: The subscription * seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() * and mmu_interval_read_retry(). A return of true indicates an invalidation * has collided with this critical region and a future * mmu_interval_read_retry() will return true. * * False is not reliable and only suggests a collision may not have * occurred. It can be called many times and does not have to hold the user * provided lock. * * This call can be used as part of loops and other expensive operations to * expedite a retry. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ return READ_ONCE(interval_sub->invalidate_seq) != seq; } extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_release(mm); } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); return 0; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { might_sleep(); lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); } /* * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it * can return an error if a notifier can't proceed without blocking, in which * case you're not allowed to modify PTEs in the specified range. * * This is mainly intended for OOM handling. */ static inline int __must_check mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { int ret = 0; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; ret = __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); return ret; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { if (mmu_notifier_range_blockable(range)) might_sleep(); if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { mm->notifier_subscriptions = NULL; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_subscriptions_destroy(mm); } static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, struct mm_struct *mm, unsigned long start, unsigned long end) { range->event = event; range->mm = mm; range->start = start; range->end = end; range->flags = flags; } static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, struct mm_struct *mm, unsigned long start, unsigned long end, void *owner) { mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PAGE_SIZE); \ __young; \ }) #define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PMD_SIZE); \ __young; \ }) #define ptep_clear_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PAGE_SIZE); \ __young; \ }) #define pmdp_clear_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PMD_SIZE); \ __young; \ }) #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { unsigned long start; unsigned long end; }; static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { range->start = start; range->end = end; } #define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) #define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return true; } static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; } static inline void mmu_notifier_release(struct mm_struct *mm) { } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { return 0; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { return 0; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { } #define mmu_notifier_range_update_to_read_only(r) false #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush static inline void mmu_notifier_synchronize(void) { } #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */
15 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* * net/tipc/eth_media.c: Ethernet bearer support for TIPC * * Copyright (c) 2001-2007, 2013-2014, Ericsson AB * Copyright (c) 2005-2008, 2011-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "bearer.h" /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) { if (bufsz < 18) /* 18 = strlen("aa:bb:cc:dd:ee:ff\0") */ return 1; sprintf(strbuf, "%pM", addr->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH; memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, addr->value, ETH_ALEN); return 0; } /* Convert raw mac address format to media addr format */ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); addr->media_id = TIPC_MEDIA_TYPE_ETH; addr->broadcast = is_broadcast_ether_addr(addr->value); return 0; } /* Convert discovery msg addr format to Ethernet media addr format */ static int tipc_eth_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { /* Skip past preamble: */ msg += TIPC_MEDIA_ADDR_OFFSET; return tipc_eth_raw2addr(b, addr, msg); } /* Ethernet media registration info */ struct tipc_media eth_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_eth_addr2str, .addr2msg = tipc_eth_addr2msg, .msg2addr = tipc_eth_msg2addr, .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" };
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/sysctl.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <asm/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/arp.h> #include <net/ax25.h> #include <net/rose.h> static int rose_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { unsigned char *buff = skb_push(skb, ROSE_MIN_LEN + 2); if (daddr) memcpy(buff + 7, daddr, dev->addr_len); *buff++ = ROSE_GFI | ROSE_Q_BIT; *buff++ = 0x00; *buff++ = ROSE_DATA; *buff++ = 0x7F; *buff++ = AX25_P_IP; if (daddr != NULL) return 37; return -37; } static int rose_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; int err; if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) return 0; if (dev->flags & IFF_UP) { err = rose_add_loopback_node((rose_address *)sa->sa_data); if (err) return err; rose_del_loopback_node((const rose_address *)dev->dev_addr); } dev_addr_set(dev, sa->sa_data); return 0; } static int rose_open(struct net_device *dev) { int err; err = rose_add_loopback_node((const rose_address *)dev->dev_addr); if (err) return err; netif_start_queue(dev); return 0; } static int rose_close(struct net_device *dev) { netif_stop_queue(dev); rose_del_loopback_node((const rose_address *)dev->dev_addr); return 0; } static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len = skb->len; if (!netif_running(dev)) { printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n"); return NETDEV_TX_BUSY; } if (!rose_route_frame(skb, NULL)) { dev_kfree_skb(skb); stats->tx_errors++; return NETDEV_TX_OK; } stats->tx_packets++; stats->tx_bytes += len; return NETDEV_TX_OK; } static const struct header_ops rose_header_ops = { .create = rose_header, }; static const struct net_device_ops rose_netdev_ops = { .ndo_open = rose_open, .ndo_stop = rose_close, .ndo_start_xmit = rose_xmit, .ndo_set_mac_address = rose_set_mac_address, }; void rose_setup(struct net_device *dev) { dev->mtu = ROSE_MAX_PACKET_SIZE - 2; dev->netdev_ops = &rose_netdev_ops; dev->header_ops = &rose_header_ops; dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; dev->addr_len = ROSE_ADDR_LEN; dev->type = ARPHRD_ROSE; /* New-style flags. */ dev->flags = IFF_NOARP; }
86 86 770 692 86 667 669 16 2 2 3 3 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/namespace.c * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc. */ #include <linux/ipc.h> #include <linux/msg.h> #include <linux/ipc_namespace.h> #include <linux/rcupdate.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include "util.h" /* * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. */ static void free_ipc(struct work_struct *unused); static DECLARE_WORK(free_ipc_work, free_ipc); static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES); } static void dec_ipc_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES); } static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; again: ucounts = inc_ipc_namespaces(user_ns); if (!ucounts) { /* * IPC namespaces are freed asynchronously, by free_ipc_work. * If frees were pending, flush_work will wait, and * return true. Fail the allocation if no frees are pending. */ if (flush_work(&free_ipc_work)) goto again; goto fail; } err = -ENOMEM; ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; err = ns_alloc_inum(&ns->ns); if (err) goto fail_free; ns->ns.ops = &ipcns_operations; refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; err = mq_init_ns(ns); if (err) goto fail_put; err = -ENOMEM; if (!setup_mq_sysctls(ns)) goto fail_put; if (!setup_ipc_sysctls(ns)) goto fail_mq; err = msg_init_ns(ns); if (err) goto fail_ipc; sem_init_ns(ns); shm_init_ns(ns); return ns; fail_ipc: retire_ipc_sysctls(ns); fail_mq: retire_mq_sysctls(ns); fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); fail_free: kfree(ns); fail_dec: dec_ipc_namespaces(ucounts); fail: return ERR_PTR(err); } struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); return create_ipc_ns(user_ns, ns); } /* * free_ipcs - free all ipcs of one type * @ns: the namespace to remove the ipcs from * @ids: the table of ipcs to free * @free: the function called to free each individual ipc * * Called for each kind of ipc when an ipc_namespace exits. */ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)) { struct kern_ipc_perm *perm; int next_id; int total, in_use; down_write(&ids->rwsem); in_use = ids->in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); total++; } up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) { /* * Caller needs to wait for an RCU grace period to have passed * after making the mount point inaccessible to new accesses. */ mntput(ns->mq_mnt); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); retire_mq_sysctls(ns); retire_ipc_sysctls(ns); dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } static LLIST_HEAD(free_ipc_list); static void free_ipc(struct work_struct *unused) { struct llist_node *node = llist_del_all(&free_ipc_list); struct ipc_namespace *n, *t; llist_for_each_entry_safe(n, t, node, mnt_llist) mnt_make_shortterm(n->mq_mnt); /* Wait for any last users to have gone away. */ synchronize_rcu(); llist_for_each_entry_safe(n, t, node, mnt_llist) free_ipc_ns(n); } /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put * * If this is the last task in the namespace exiting, and * it is dropping the refcount to 0, then it can race with * a task in another ipc namespace but in a mounts namespace * which has this ipcns's mqueuefs mounted, doing some action * with one of the mqueuefs files. That can raise the refcount. * So dropping the refcount, and raising the refcount when * accessing it through the VFS, are protected with mq_lock. * * (Clearly, a task raising the refcount on its own ipc_ns * needn't take mq_lock since it can't race with the last task * in the ipcns exiting). */ void put_ipc_ns(struct ipc_namespace *ns) { if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } } static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) { return container_of(ns, struct ipc_namespace, ns); } static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) ns = get_ipc_ns(nsproxy->ipc_ns); task_unlock(task); return ns ? &ns->ns : NULL; } static void ipcns_put(struct ns_common *ns) { return put_ipc_ns(to_ipc_ns(ns)); } static int ipcns_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; } static struct user_namespace *ipcns_owner(struct ns_common *ns) { return to_ipc_ns(ns)->user_ns; } const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, .owner = ipcns_owner, };
17 17 12 3 1 1 3 14 16 16 8 8 16 16 16 16 2 14 16 19 19 10 16 21 21 21 21 10 10 10 10 1 22 26 24 23 9 23 112 113 4 4 112 4 4 4 7 1 5 2 6 6 4 5 68 30 6 18 17 6 109 108 1 109 20 20 2 2 1 14 2 2 14 15 15 15 13 2 9 2 15 19 19 16 16 19 21 109 2174 2166 65 59 27 7 7 1 4 4 5 21 21 3 19 21 16 5 756 692 670 19 19 394 375 19 1 214 210 2 12 56 54 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include <linux/ethtool.h> #include <net/netdev_lock.h> #include "ipvlan.h" static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, struct netlink_ext_ack *extack) { struct ipvl_dev *ipvlan; unsigned int flags; int err; ASSERT_RTNL(); if (port->mode != nval) { list_for_each_entry(ipvlan, &port->ipvlans, pnode) { flags = ipvlan->dev->flags; if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { err = dev_change_flags(ipvlan->dev, flags | IFF_NOARP, extack); } else { err = dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, extack); } if (unlikely(err)) goto fail; } if (nval == IPVLAN_MODE_L3S) { /* New mode is L3S */ err = ipvlan_l3s_register(port); if (err) goto fail; } else if (port->mode == IPVLAN_MODE_L3S) { /* Old mode was L3S */ ipvlan_l3s_unregister(port); } port->mode = nval; } return 0; fail: /* Undo the flags changes that have been done so far. */ list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) { flags = ipvlan->dev->flags; if (port->mode == IPVLAN_MODE_L3 || port->mode == IPVLAN_MODE_L3S) dev_change_flags(ipvlan->dev, flags | IFF_NOARP, NULL); else dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, NULL); } return err; } static int ipvlan_port_create(struct net_device *dev) { struct ipvl_port *port; int err, idx; port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); if (!port) return -ENOMEM; write_pnet(&port->pnet, dev_net(dev)); port->dev = dev; port->mode = IPVLAN_MODE_L3; INIT_LIST_HEAD(&port->ipvlans); for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) INIT_HLIST_HEAD(&port->hlhead[idx]); skb_queue_head_init(&port->backlog); INIT_WORK(&port->wq, ipvlan_process_multicast); ida_init(&port->ida); port->dev_id_start = 1; err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); if (err) goto err; netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); return 0; err: kfree(port); return err; } static void ipvlan_port_destroy(struct net_device *dev) { struct ipvl_port *port = ipvlan_port_get_rtnl(dev); struct sk_buff *skb; netdev_put(dev, &port->dev_tracker); if (port->mode == IPVLAN_MODE_L3S) ipvlan_l3s_unregister(port); netdev_rx_handler_unregister(dev); cancel_work_sync(&port->wq); while ((skb = __skb_dequeue(&port->backlog)) != NULL) { dev_put(skb->dev); kfree_skb(skb); } ida_destroy(&port->ida); kfree(port); } #define IPVLAN_ALWAYS_ON_OFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL) #define IPVLAN_ALWAYS_ON \ (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_VLAN_CHALLENGED) #define IPVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \ NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) /* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */ #define IPVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) static int ipvlan_init(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_port *port; int err; dev->state = (dev->state & ~IPVLAN_STATE_MASK) | (phy_dev->state & IPVLAN_STATE_MASK); dev->features = phy_dev->features & IPVLAN_FEATURES; dev->features |= IPVLAN_ALWAYS_ON; dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; dev->hw_enc_features |= dev->features; dev->lltx = true; netif_inherit_tso_max(dev, phy_dev); dev->hard_header_len = phy_dev->hard_header_len; netdev_lockdep_set_classes(dev); ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM; if (!netif_is_ipvlan_port(phy_dev)) { err = ipvlan_port_create(phy_dev); if (err < 0) { free_percpu(ipvlan->pcpu_stats); return err; } } port = ipvlan_port_get_rtnl(phy_dev); port->count += 1; return 0; } static void ipvlan_uninit(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_port *port; free_percpu(ipvlan->pcpu_stats); port = ipvlan_port_get_rtnl(phy_dev); port->count -= 1; if (!port->count) ipvlan_port_destroy(port->dev); } static int ipvlan_open(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_addr *addr; if (ipvlan->port->mode == IPVLAN_MODE_L3 || ipvlan->port->mode == IPVLAN_MODE_L3S) dev->flags |= IFF_NOARP; else dev->flags &= ~IFF_NOARP; rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_add(ipvlan, addr); rcu_read_unlock(); return 0; } static int ipvlan_stop(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_addr *addr; dev_uc_unsync(phy_dev, dev); dev_mc_unsync(phy_dev, dev); rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_del(addr); rcu_read_unlock(); return 0; } static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); int skblen = skb->len; int ret; ret = ipvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct ipvl_pcpu_stats *pcptr; pcptr = this_cpu_ptr(ipvlan->pcpu_stats); u64_stats_update_begin(&pcptr->syncp); u64_stats_inc(&pcptr->tx_pkts); u64_stats_add(&pcptr->tx_bytes, skblen); u64_stats_update_end(&pcptr->syncp); } else { this_cpu_inc(ipvlan->pcpu_stats->tx_drps); } return ret; } static netdev_features_t ipvlan_fix_features(struct net_device *dev, netdev_features_t features) { struct ipvl_dev *ipvlan = netdev_priv(dev); features |= NETIF_F_ALL_FOR_ALL; features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES); features = netdev_increment_features(ipvlan->phy_dev->features, features, features); features |= IPVLAN_ALWAYS_ON; features &= (IPVLAN_FEATURES | IPVLAN_ALWAYS_ON); return features; } static void ipvlan_change_rx_flags(struct net_device *dev, int change) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; if (change & IFF_ALLMULTI) dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1); } static void ipvlan_set_multicast_mac_filter(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE); } else { struct netdev_hw_addr *ha; DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE); bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE); netdev_for_each_mc_addr(ha, dev) __set_bit(ipvlan_mac_hash(ha->addr), mc_filters); /* Turn-on broadcast bit irrespective of address family, * since broadcast is deferred to a work-queue, hence no * impact on fast-path processing. */ __set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters); bitmap_copy(ipvlan->mac_filters, mc_filters, IPVLAN_MAC_FILTER_SIZE); } dev_uc_sync(ipvlan->phy_dev, dev); dev_mc_sync(ipvlan->phy_dev, dev); } static void ipvlan_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct ipvl_dev *ipvlan = netdev_priv(dev); if (ipvlan->pcpu_stats) { struct ipvl_pcpu_stats *pcptr; u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes; u32 rx_errs = 0, tx_drps = 0; u32 strt; int idx; for_each_possible_cpu(idx) { pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); do { strt = u64_stats_fetch_begin(&pcptr->syncp); rx_pkts = u64_stats_read(&pcptr->rx_pkts); rx_bytes = u64_stats_read(&pcptr->rx_bytes); rx_mcast = u64_stats_read(&pcptr->rx_mcast); tx_pkts = u64_stats_read(&pcptr->tx_pkts); tx_bytes = u64_stats_read(&pcptr->tx_bytes); } while (u64_stats_fetch_retry(&pcptr->syncp, strt)); s->rx_packets += rx_pkts; s->rx_bytes += rx_bytes; s->multicast += rx_mcast; s->tx_packets += tx_pkts; s->tx_bytes += tx_bytes; /* u32 values are updated without syncp protection. */ rx_errs += READ_ONCE(pcptr->rx_errs); tx_drps += READ_ONCE(pcptr->tx_drps); } s->rx_errors = rx_errs; s->rx_dropped = rx_errs; s->tx_dropped = tx_drps; } s->tx_errors = DEV_STATS_READ(dev, tx_errors); } static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; return vlan_vid_add(phy_dev, proto, vid); } static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; vlan_vid_del(phy_dev, proto, vid); return 0; } static int ipvlan_get_iflink(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); return READ_ONCE(ipvlan->phy_dev->ifindex); } static const struct net_device_ops ipvlan_netdev_ops = { .ndo_init = ipvlan_init, .ndo_uninit = ipvlan_uninit, .ndo_open = ipvlan_open, .ndo_stop = ipvlan_stop, .ndo_start_xmit = ipvlan_start_xmit, .ndo_fix_features = ipvlan_fix_features, .ndo_change_rx_flags = ipvlan_change_rx_flags, .ndo_set_rx_mode = ipvlan_set_multicast_mac_filter, .ndo_get_stats64 = ipvlan_get_stats64, .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, .ndo_get_iflink = ipvlan_get_iflink, }; static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { const struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; /* TODO Probably use a different field than dev_addr so that the * mac-address on the virtual device is portable and can be carried * while the packets use the mac-addr on the physical device. */ return dev_hard_header(skb, phy_dev, type, daddr, saddr ? : phy_dev->dev_addr, len); } static const struct header_ops ipvlan_header_ops = { .create = ipvlan_hard_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) { ipvlan->dev->mtu = dev->mtu; } static bool netif_is_ipvlan(const struct net_device *dev) { /* both ipvlan and ipvtap devices use the same netdev_ops */ return dev->netdev_ops == &ipvlan_netdev_ops; } static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct ipvl_dev *ipvlan = netdev_priv(dev); return __ethtool_get_link_ksettings(ipvlan->phy_dev, cmd); } static void ipvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver)); strscpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version)); } static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); return ipvlan->msg_enable; } static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value) { struct ipvl_dev *ipvlan = netdev_priv(dev); ipvlan->msg_enable = value; } static const struct ethtool_ops ipvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = ipvlan_ethtool_get_link_ksettings, .get_drvinfo = ipvlan_ethtool_get_drvinfo, .get_msglevel = ipvlan_ethtool_get_msglevel, .set_msglevel = ipvlan_ethtool_set_msglevel, }; static int ipvlan_nl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); int err = 0; if (!data) return 0; if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (data[IFLA_IPVLAN_MODE]) { u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); err = ipvlan_set_port_mode(port, nmode, extack); } if (!err && data[IFLA_IPVLAN_FLAGS]) { u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); if (flags & IPVLAN_F_PRIVATE) ipvlan_mark_private(port); else ipvlan_clear_private(port); if (flags & IPVLAN_F_VEPA) ipvlan_mark_vepa(port); else ipvlan_clear_vepa(port); } return err; } static size_t ipvlan_nl_getsize(const struct net_device *dev) { return (0 + nla_total_size(2) /* IFLA_IPVLAN_MODE */ + nla_total_size(2) /* IFLA_IPVLAN_FLAGS */ ); } static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) return 0; if (data[IFLA_IPVLAN_MODE]) { u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); if (mode >= IPVLAN_MODE_MAX) return -EINVAL; } if (data[IFLA_IPVLAN_FLAGS]) { u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); /* Only two bits are used at this moment. */ if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) return -EINVAL; /* Also both flags can't be active at the same time. */ if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) == (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) return -EINVAL; } return 0; } static int ipvlan_nl_fillinfo(struct sk_buff *skb, const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); int ret = -EINVAL; if (!port) goto err; ret = -EMSGSIZE; if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) goto err; if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags)) goto err; return 0; err: return ret; } int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct ipvl_dev *ipvlan = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ipvl_port *port; struct net_device *phy_dev; int err; u16 mode = IPVLAN_MODE_L3; if (!tb[IFLA_LINK]) return -EINVAL; phy_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!phy_dev) return -ENODEV; if (netif_is_ipvlan(phy_dev)) { struct ipvl_dev *tmp = netdev_priv(phy_dev); phy_dev = tmp->phy_dev; if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; } else if (!netif_is_ipvlan_port(phy_dev)) { /* Exit early if the underlying link is invalid or busy */ if (phy_dev->type != ARPHRD_ETHER || phy_dev->flags & IFF_LOOPBACK) { netdev_err(phy_dev, "Master is either lo or non-ether device\n"); return -EINVAL; } if (netdev_is_rx_handler_busy(phy_dev)) { netdev_err(phy_dev, "Device is already in use.\n"); return -EBUSY; } } ipvlan->phy_dev = phy_dev; ipvlan->dev = dev; ipvlan->sfeatures = IPVLAN_FEATURES; if (!tb[IFLA_MTU]) ipvlan_adjust_mtu(ipvlan, phy_dev); INIT_LIST_HEAD(&ipvlan->addrs); spin_lock_init(&ipvlan->addrs_lock); /* TODO Probably put random address here to be presented to the * world but keep using the physical-dev address for the outgoing * packets. */ eth_hw_addr_set(dev, phy_dev->dev_addr); dev->priv_flags |= IFF_NO_RX_HANDLER; err = register_netdevice(dev); if (err < 0) return err; /* ipvlan_init() would have created the port, if required */ port = ipvlan_port_get_rtnl(phy_dev); ipvlan->port = port; /* If the port-id base is at the MAX value, then wrap it around and * begin from 0x1 again. This may be due to a busy system where lots * of slaves are getting created and deleted. */ if (port->dev_id_start == 0xFFFE) port->dev_id_start = 0x1; /* Since L2 address is shared among all IPvlan slaves including * master, use unique 16 bit dev-ids to differentiate among them. * Assign IDs between 0x1 and 0xFFFE (used by the master) to each * slave link [see addrconf_ifid_eui48()]. */ err = ida_alloc_range(&port->ida, port->dev_id_start, 0xFFFD, GFP_KERNEL); if (err < 0) err = ida_alloc_range(&port->ida, 0x1, port->dev_id_start - 1, GFP_KERNEL); if (err < 0) goto unregister_netdev; dev->dev_id = err; /* Increment id-base to the next slot for the future assignment */ port->dev_id_start = err + 1; err = netdev_upper_dev_link(phy_dev, dev, extack); if (err) goto remove_ida; /* Flags are per port and latest update overrides. User has * to be consistent in setting it just like the mode attribute. */ if (data && data[IFLA_IPVLAN_FLAGS]) port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); if (data && data[IFLA_IPVLAN_MODE]) mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); err = ipvlan_set_port_mode(port, mode, extack); if (err) goto unlink_netdev; list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); netif_stacked_transfer_operstate(phy_dev, dev); return 0; unlink_netdev: netdev_upper_dev_unlink(phy_dev, dev); remove_ida: ida_free(&port->ida, dev->dev_id); unregister_netdev: unregister_netdevice(dev); return err; } EXPORT_SYMBOL_GPL(ipvlan_link_new); void ipvlan_link_delete(struct net_device *dev, struct list_head *head) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_addr *addr, *next; spin_lock_bh(&ipvlan->addrs_lock); list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ipvlan_ht_addr_del(addr); list_del_rcu(&addr->anode); kfree_rcu(addr, rcu); } spin_unlock_bh(&ipvlan->addrs_lock); ida_free(&ipvlan->port->ida, dev->dev_id); list_del_rcu(&ipvlan->pnode); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(ipvlan->phy_dev, dev); } EXPORT_SYMBOL_GPL(ipvlan_link_delete); void ipvlan_link_setup(struct net_device *dev) { ether_setup(dev); dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; dev->netdev_ops = &ipvlan_netdev_ops; dev->needs_free_netdev = true; dev->header_ops = &ipvlan_header_ops; dev->ethtool_ops = &ipvlan_ethtool_ops; } EXPORT_SYMBOL_GPL(ipvlan_link_setup); static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = { [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 }, }; static struct net *ipvlan_get_link_net(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); return dev_net(ipvlan->phy_dev); } static struct rtnl_link_ops ipvlan_link_ops = { .kind = "ipvlan", .priv_size = sizeof(struct ipvl_dev), .setup = ipvlan_link_setup, .newlink = ipvlan_link_new, .dellink = ipvlan_link_delete, .get_link_net = ipvlan_get_link_net, }; int ipvlan_link_register(struct rtnl_link_ops *ops) { ops->get_size = ipvlan_nl_getsize; ops->policy = ipvlan_nl_policy; ops->validate = ipvlan_nl_validate; ops->fill_info = ipvlan_nl_fillinfo; ops->changelink = ipvlan_nl_changelink; ops->maxtype = IFLA_IPVLAN_MAX; return rtnl_link_register(ops); } EXPORT_SYMBOL_GPL(ipvlan_link_register); static int ipvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipvl_dev *ipvlan, *next; struct ipvl_port *port; LIST_HEAD(lst_kill); int err; if (!netif_is_ipvlan_port(dev)) return NOTIFY_DONE; port = ipvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) netif_stacked_transfer_operstate(ipvlan->phy_dev, ipvlan->dev); break; case NETDEV_REGISTER: { struct net *oldnet, *newnet = dev_net(dev); oldnet = read_pnet(&port->pnet); if (net_eq(newnet, oldnet)) break; write_pnet(&port->pnet, newnet); if (port->mode == IPVLAN_MODE_L3S) ipvlan_migrate_l3s_hook(oldnet, newnet); break; } case NETDEV_UNREGISTER: if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode) ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, &lst_kill); unregister_netdevice_many(&lst_kill); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { netif_inherit_tso_max(ipvlan->dev, dev); netdev_update_features(ipvlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(ipvlan, &port->ipvlans, pnode) ipvlan_adjust_mtu(ipvlan, dev); break; case NETDEV_PRE_CHANGEADDR: prechaddr_info = ptr; list_for_each_entry(ipvlan, &port->ipvlans, pnode) { err = dev_pre_changeaddr_notify(ipvlan->dev, prechaddr_info->dev_addr, extack); if (err) return notifier_from_errno(err); } break; case NETDEV_CHANGEADDR: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { eth_hw_addr_set(ipvlan->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); } break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: list_for_each_entry(ipvlan, &port->ipvlans, pnode) call_netdevice_notifiers(event, ipvlan->dev); } return NOTIFY_DONE; } /* the caller must held the addrs lock */ static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); if (!addr) return -ENOMEM; addr->master = ipvlan; if (!is_v6) { memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); addr->atype = IPVL_IPV4; #if IS_ENABLED(CONFIG_IPV6) } else { memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr)); addr->atype = IPVL_IPV6; #endif } list_add_tail_rcu(&addr->anode, &ipvlan->addrs); /* If the interface is not up, the address will be added to the hash * list by ipvlan_open. */ if (netif_running(ipvlan->dev)) ipvlan_ht_addr_add(ipvlan, addr); return 0; } static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; spin_lock_bh(&ipvlan->addrs_lock); addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); if (!addr) { spin_unlock_bh(&ipvlan->addrs_lock); return; } ipvlan_ht_addr_del(addr); list_del_rcu(&addr->anode); spin_unlock_bh(&ipvlan->addrs_lock); kfree_rcu(addr, rcu); } static bool ipvlan_is_valid_dev(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); if (!netif_is_ipvlan(dev)) return false; if (!ipvlan || !ipvlan->port) return false; return true; } #if IS_ENABLED(CONFIG_IPV6) static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { int ret = -EINVAL; spin_lock_bh(&ipvlan->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) netif_err(ipvlan, ifup, ipvlan->dev, "Failed to add IPv6=%pI6c addr for %s intf\n", ip6_addr, ipvlan->dev->name); else ret = ipvlan_add_addr(ipvlan, ip6_addr, true); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { return ipvlan_del_addr(ipvlan, ip6_addr, true); } static int ipvlan_addr6_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; struct net_device *dev = (struct net_device *)if6->idev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_add_addr6(ipvlan, &if6->addr)) return NOTIFY_BAD; break; case NETDEV_DOWN: ipvlan_del_addr6(ipvlan, &if6->addr); break; } return NOTIFY_OK; } static int ipvlan_addr6_validator_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr; struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { NL_SET_ERR_MSG(i6vi->extack, "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); } break; } return NOTIFY_OK; } #endif static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) { int ret = -EINVAL; spin_lock_bh(&ipvlan->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) netif_err(ipvlan, ifup, ipvlan->dev, "Failed to add IPv4=%pI4 on %s intf.\n", ip4_addr, ipvlan->dev->name); else ret = ipvlan_add_addr(ipvlan, ip4_addr, false); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) { return ipvlan_del_addr(ipvlan, ip4_addr, false); } static int ipvlan_addr4_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); struct in_addr ip4_addr; if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: ip4_addr.s_addr = if4->ifa_address; if (ipvlan_add_addr4(ipvlan, &ip4_addr)) return NOTIFY_BAD; break; case NETDEV_DOWN: ip4_addr.s_addr = if4->ifa_address; ipvlan_del_addr4(ipvlan, &ip4_addr); break; } return NOTIFY_OK; } static int ipvlan_addr4_validator_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in_validator_info *ivi = (struct in_validator_info *)ptr; struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { NL_SET_ERR_MSG(ivi->extack, "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); } break; } return NOTIFY_OK; } static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { .notifier_call = ipvlan_addr4_event, }; static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = { .notifier_call = ipvlan_addr4_validator_event, }; static struct notifier_block ipvlan_notifier_block __read_mostly = { .notifier_call = ipvlan_device_event, }; #if IS_ENABLED(CONFIG_IPV6) static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { .notifier_call = ipvlan_addr6_event, }; static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = { .notifier_call = ipvlan_addr6_validator_event, }; #endif static int __init ipvlan_init_module(void) { int err; ipvlan_init_secret(); register_netdevice_notifier(&ipvlan_notifier_block); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&ipvlan_addr6_notifier_block); register_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif register_inetaddr_notifier(&ipvlan_addr4_notifier_block); register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block); err = ipvlan_l3s_init(); if (err < 0) goto error; err = ipvlan_link_register(&ipvlan_link_ops); if (err < 0) { ipvlan_l3s_cleanup(); goto error; } return 0; error: unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); unregister_inetaddr_validator_notifier( &ipvlan_addr4_vtor_notifier_block); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); unregister_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif unregister_netdevice_notifier(&ipvlan_notifier_block); return err; } static void __exit ipvlan_cleanup_module(void) { rtnl_link_unregister(&ipvlan_link_ops); ipvlan_l3s_cleanup(); unregister_netdevice_notifier(&ipvlan_notifier_block); unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); unregister_inetaddr_validator_notifier( &ipvlan_addr4_vtor_notifier_block); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); unregister_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif } module_init(ipvlan_init_module); module_exit(ipvlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>"); MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); MODULE_ALIAS_RTNL_LINK("ipvlan");
6 1 1 1 1 1 1 1 1 1 1 2 2 44 1 4 4 1 1 1 1 55 55 55 55 55 55 55 35 35 24 24 24 1 22 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_drr.c Deficit Round Robin scheduler * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct drr_class { struct Qdisc_class_common common; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct list_head alist; struct Qdisc *qdisc; u32 quantum; u32 deficit; }; struct drr_sched { struct list_head active; struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc_class_hash clhash; }; static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) { struct drr_sched *q = qdisc_priv(sch); struct Qdisc_class_common *clc; clc = qdisc_class_find(&q->clhash, classid); if (clc == NULL) return NULL; return container_of(clc, struct drr_class, common); } static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { [TCA_DRR_QUANTUM] = { .type = NLA_U32 }, }; static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)*arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_DRR_MAX + 1]; u32 quantum; int err; if (!opt) { NL_SET_ERR_MSG(extack, "DRR options are required for this operation"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_DRR_MAX, opt, drr_policy, extack); if (err < 0) return err; if (tb[TCA_DRR_QUANTUM]) { quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]); if (quantum == 0) { NL_SET_ERR_MSG(extack, "Specified DRR quantum cannot be zero"); return -EINVAL; } } else quantum = psched_mtu(qdisc_dev(sch)); if (cl != NULL) { if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); return err; } } sch_tree_lock(sch); if (tb[TCA_DRR_QUANTUM]) cl->quantum = quantum; sch_tree_unlock(sch); return 0; } cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; else qdisc_hash_add(cl->qdisc, true); if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); qdisc_put(cl->qdisc); kfree(cl); return err; } } sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; } static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) { gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); } static int drr_delete_class(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; if (qdisc_class_in_use(&cl->common)) { NL_SET_ERR_MSG(extack, "DRR class is in use"); return -EBUSY; } sch_tree_lock(sch); qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); sch_tree_unlock(sch); drr_destroy_class(sch, cl); return 0; } static unsigned long drr_search_class(struct Qdisc *sch, u32 classid) { return (unsigned long)drr_find_class(sch, classid); } static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); if (cl) { NL_SET_ERR_MSG(extack, "DRR classid must be zero"); return NULL; } return q->block; } static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) { struct drr_class *cl = drr_find_class(sch, classid); if (cl) qdisc_class_get(&cl->common); return (unsigned long)cl; } static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; qdisc_class_put(&cl->common); } static int drr_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct drr_class *cl = (struct drr_class *)arg; if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->common.classid, NULL); if (new == NULL) new = &noop_qdisc; } *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; return cl->qdisc; } static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; list_del(&cl->alist); } static int drr_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct drr_class *cl = (struct drr_class *)arg; struct nlattr *nest; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->qdisc->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct drr_class *cl = (struct drr_class *)arg; __u32 qlen = qdisc_qlen_sum(cl->qdisc); struct Qdisc *cl_q = cl->qdisc; struct tc_drr_stats xstats; memset(&xstats, 0, sizeof(xstats)); if (qlen) xstats.deficit = cl->deficit; if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; } } } static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct tcf_result res; struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { cl = drr_find_class(sch, skb->priority); if (cl != NULL) return cl; } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif cl = (struct drr_class *)res.class; if (cl == NULL) cl = drr_find_class(sch, res.classid); return cl; } return NULL; } static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; bool first; cl = drr_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return err; } first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; qdisc_qstats_drop(sch); } return err; } if (first) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } sch->qstats.backlog += len; sch->q.qlen++; return err; } static struct sk_buff *drr_dequeue(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct sk_buff *skb; unsigned int len; if (list_empty(&q->active)) goto out; while (1) { cl = list_first_entry(&q->active, struct drr_class, alist); skb = cl->qdisc->ops->peek(cl->qdisc); if (skb == NULL) { qdisc_warn_nonwc(__func__, cl->qdisc); goto out; } len = qdisc_pkt_len(skb); if (len <= cl->deficit) { cl->deficit -= len; skb = qdisc_dequeue_peeked(cl->qdisc); if (unlikely(skb == NULL)) goto out; if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } cl->deficit += cl->quantum; list_move_tail(&cl->alist, &q->active); } out: return NULL; } static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); int err; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; INIT_LIST_HEAD(&q->active); return 0; } static void drr_reset_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) list_del(&cl->alist); qdisc_reset(cl->qdisc); } } } static void drr_destroy_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct hlist_node *next; unsigned int i; tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) drr_destroy_class(sch, cl); } qdisc_class_hash_destroy(&q->clhash); } static const struct Qdisc_class_ops drr_class_ops = { .change = drr_change_class, .delete = drr_delete_class, .find = drr_search_class, .tcf_block = drr_tcf_block, .bind_tcf = drr_bind_tcf, .unbind_tcf = drr_unbind_tcf, .graft = drr_graft_class, .leaf = drr_class_leaf, .qlen_notify = drr_qlen_notify, .dump = drr_dump_class, .dump_stats = drr_dump_class_stats, .walk = drr_walk, }; static struct Qdisc_ops drr_qdisc_ops __read_mostly = { .cl_ops = &drr_class_ops, .id = "drr", .priv_size = sizeof(struct drr_sched), .enqueue = drr_enqueue, .dequeue = drr_dequeue, .peek = qdisc_peek_dequeued, .init = drr_init_qdisc, .reset = drr_reset_qdisc, .destroy = drr_destroy_qdisc, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("drr"); static int __init drr_init(void) { return register_qdisc(&drr_qdisc_ops); } static void __exit drr_exit(void) { unregister_qdisc(&drr_qdisc_ops); } module_init(drr_init); module_exit(drr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Deficit Round Robin scheduler");
5 5 5 5 1429 161 619 624 620 51 6 3 505 507 501 2 34 885 886 128 1 871 126 2 884 890 130 874 128 2 3 3 12 12 12 12 1300 1287 79 1 658 654 1 20 705 704 686 679 48 6 711 701 49 6 5 2 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/export.h> #include <linux/init.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/checksum.h> #include <linux/netfilter.h> #include <net/netfilter/nf_nat.h> #include <linux/ipv6.h> #include <linux/netfilter_ipv6.h> #include <net/checksum.h> #include <net/ip6_checksum.h> #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/nfnetlink_conntrack.h> static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype); static void __udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, struct udphdr *hdr, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, bool do_csum) { __be16 *portptr, newport; if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.udp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } if (do_csum) { nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } *portptr = newport; } static bool udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); return true; } static bool udplite_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, true); #endif return true; } static bool sctp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct sctphdr *hdr; int hdrsize = 8; /* This could be an inner header returned in imcp packet; in such * cases we cannot update the checksum field since it is outside * of the 8 bytes of transport layer headers we are guaranteed. */ if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ hdr->source = tuple->src.u.sctp.port; } else { /* Get rid of dst port */ hdr->dest = tuple->dst.u.sctp.port; } if (hdrsize < sizeof(*hdr)) return true; if (skb->ip_summed != CHECKSUM_PARTIAL) { hdr->checksum = sctp_compute_cksum(skb, hdroff); skb->ip_summed = CHECKSUM_NONE; } #endif return true; } static bool tcp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct tcphdr *hdr; __be16 *portptr, newport, oldport; int hdrsize = 8; /* TCP connection tracking guarantees this much */ /* this could be a inner header returned in icmp packet; in such cases we cannot update the checksum field since it is outside of the 8 bytes of transport layer headers we are guaranteed */ if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.tcp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.tcp.port; portptr = &hdr->dest; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } static bool dccp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_DCCP struct dccp_hdr *hdr; __be16 *portptr, oldport, newport; int hdrsize = 8; /* DCCP connection tracking guarantees this much */ if (skb->len >= hdroff + sizeof(struct dccp_hdr)) hdrsize = sizeof(struct dccp_hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct dccp_hdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { newport = tuple->src.u.dccp.port; portptr = &hdr->dccph_sport; } else { newport = tuple->dst.u.dccp.port; portptr = &hdr->dccph_dport; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, false); #endif return true; } static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); switch (hdr->type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMP_INFO_REQUEST: case ICMP_INFO_REPLY: case ICMP_ADDRESS: case ICMP_ADDRESSREPLY: break; default: return true; } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } static bool icmpv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmp6hdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); nf_csum_update(skb, iphdroff, &hdr->icmp6_cksum, tuple, maniptype); if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || hdr->icmp6_type == ICMPV6_ECHO_REPLY) { inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, hdr->icmp6_identifier, tuple->src.u.icmp.id, false); hdr->icmp6_identifier = tuple->src.u.icmp.id; } return true; } /* manipulate a GRE packet according to maniptype */ static bool gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) const struct gre_base_hdr *greh; struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; switch (greh->flags & GRE_VERSION) { case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; default: pr_debug("can't nat unknown GRE version\n"); return false; } #endif return true; } static bool l4proto_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { switch (tuple->dst.protonum) { case IPPROTO_TCP: return tcp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDP: return udp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDPLITE: return udplite_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_SCTP: return sctp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMP: return icmp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_DCCP: return dccp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); } /* If we don't know protocol -- no error, pass it unmodified. */ return true; } static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { struct iphdr *iph; unsigned int hdroff; if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; hdroff = iphdroff + iph->ihl * 4; if (!l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; iph = (void *)skb->data + iphdroff; if (maniptype == NF_NAT_MANIP_SRC) { csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; } else { csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } return true; } static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ipv6h; __be16 frag_off; int hdroff; u8 nexthdr; if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; nexthdr = ipv6h->nexthdr; hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), &nexthdr, &frag_off); if (hdroff < 0) goto manip_addr; if ((frag_off & htons(~0x7)) == 0 && !l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; /* must reload, offset might have changed */ ipv6h = (void *)skb->data + iphdroff; manip_addr: if (maniptype == NF_NAT_MANIP_SRC) ipv6h->saddr = target->src.u3.in6; else ipv6h->daddr = target->dst.u3.in6; #endif return true; } unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir) { struct nf_conntrack_tuple target; /* We are aiming to look like inverse of other direction. */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); switch (target.src.l3num) { case NFPROTO_IPV6: if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; case NFPROTO_IPV4: if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; default: WARN_ON_ONCE(1); break; } return NF_DROP; } static void nf_nat_ipv4_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); __be32 oldip, newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = iph->saddr; newip = t->src.u3.ip; } else { oldip = iph->daddr; newip = t->dst.u3.ip; } inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv6_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff); const struct in6_addr *oldip, *newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = &ipv6h->saddr; newip = &t->src.u3.in6; } else { oldip = &ipv6h->daddr; newip = &t->dst.u3.in6; } inet_proto_csum_replace16(check, skb, oldip->s6_addr32, newip->s6_addr32, true); #endif } static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { switch (t->src.l3num) { case NFPROTO_IPV4: nf_nat_ipv4_csum_update(skb, iphdroff, check, t, maniptype); return; case NFPROTO_IPV6: nf_nat_ipv6_csum_update(skb, iphdroff, check, t, maniptype); return; } } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct iphdr *iph = ip_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + ip_hdrlen(skb); skb->csum_offset = (void *)check - data; *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #if IS_ENABLED(CONFIG_IPV6) static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + (data - (void *)skb->data); skb->csum_offset = (void *)check - data; *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #endif void nf_nat_csum_recalc(struct sk_buff *skb, u8 nfproto, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { switch (nfproto) { case NFPROTO_IPV4: nf_nat_ipv4_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: nf_nat_ipv6_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #endif } WARN_ON_ONCE(1); } int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum) { struct { struct icmphdr icmp; struct iphdr ip; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp.type == ICMP_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt may reallocate */ inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMP; if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); static unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret == NF_ACCEPT && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); return ret; } #ifdef CONFIG_XFRM static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct sock *sk = skb->sk; struct dst_entry *dst; unsigned int hh_len; struct flowi fl; int err; err = xfrm_decode_session(net, skb, &fl, family); if (err < 0) return err; dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; if (!dst_hold_safe(dst)) return -EHOSTUNREACH; if (sk && !net_eq(net, sock_net(sk))) sk = NULL; dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_drop(skb); skb_dst_set(skb, dst); /* Change in oif may mean change in hh_len. */ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -ENOMEM; return 0; } #endif static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) { enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: case IPPROTO_UDP: break; default: return false; } dir = CTINFO2DIR(ctinfo); if (dir != IP_CT_DIR_ORIGINAL) return false; return ct->tuplehash[!dir].tuple.dst.u.all != sport; } static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { __be32 saddr = ip_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* skb has a socket assigned via tcp edemux. We need to check * if nf_nat_ipv4_fn() has mangled the packet in a way that * edemux would not have found this socket. * * This includes both changes to the source address and changes * to the source port, which are both handled by the * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux * might have found a socket for the old (pre-snat) address. */ if (saddr != ip_hdr(skb)->saddr || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); /* TCP edemux obtained wrong socket */ return ret; } static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_out, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, }; int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn); #if IS_ENABLED(CONFIG_IPV6) int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen) { struct { struct icmp6hdr icmp6; struct ipv6hdr ip6; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp6.icmp6_type == NDISC_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); inside = (void *)skb->data + hdrlen; inside->icmp6.icmp6_cksum = 0; inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMPV6; if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); static unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be16 frag_off; int hdrlen; u8 nexthdr; ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { nexthdr = ipv6_hdr(skb)->nexthdr; hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, state->hook, hdrlen)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr = ipv6_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* see nf_nat_ipv4_local_in */ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); return ret; } static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); verdict = ret & NF_VERDICT_MASK; if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); return ret; } static unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3) || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_out, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_local_fn, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_local_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, }, }; int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn); #endif /* CONFIG_IPV6 */ #if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT) int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops) { int ret; if (WARN_ON_ONCE(ops->pf != NFPROTO_INET)) return -EINVAL; ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); if (ret) return ret; ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); if (ret) nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); return ret; } EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn); #endif /* NFT INET NAT */
6 6 2 4 4 4 4 4 4 4 2 3 4 4 4 4 4 4 28 28 3 22 2 15 5 4 14 5 3 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPV4 GSO/GRO offload support * Linux INET implementation * * GRE GSO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <net/gre.h> #include <net/gro.h> #include <net/gso.h> static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); bool need_csum, offload_csum, gso_partial, need_ipsec; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; if (!skb->encapsulation) goto out; if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr))) goto out; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; /* setup inner skb. */ skb->encapsulation = 0; SKB_GSO_CB(skb)->encap_level = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); skb->encap_hdr_csum = need_csum; features &= skb->dev->hw_enc_features; if (need_csum) features &= ~NETIF_F_SCTP_CRC; need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && !need_ipsec && (skb->dev->features & NETIF_F_HW_CSUM)); /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); goto out; } gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); outer_hlen = skb_tnl_header_len(skb); gre_offset = outer_hlen - tnl_hlen; skb = segs; do { struct gre_base_hdr *greh; __sum16 *pcsum; /* Set up inner headers if we are offloading inner checksum */ if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } skb->mac_len = mac_len; skb->protocol = protocol; __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, gre_offset); if (!need_csum) continue; greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); if (gso_partial && skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that * the partial checksum is based on actual size * whereas headers should be based on MSS size. */ partial_adj = skb->len + skb_headroom(skb) - SKB_GSO_CB(skb)->data_offset - skb_shinfo(skb)->gso_size; *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj)); } else { *pcsum = 0; } *(pcsum + 1) = 0; if (skb->encapsulation || !offload_csum) { *pcsum = gso_make_checksum(skb, 0); } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = sizeof(*greh); } } while ((skb = skb->next)); out: return segs; } static struct sk_buff *gre_gro_receive(struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; unsigned int off; int flush = 1; struct packet_offload *ptype; __be16 type; if (NAPI_GRO_CB(skb)->encap_mark) goto out; NAPI_GRO_CB(skb)->encap_mark = 1; off = skb_gro_offset(skb); hlen = off + sizeof(*greh); greh = skb_gro_header(skb, hlen, off); if (unlikely(!greh)) goto out; /* Only support version 0 and K (key), C (csum) flags. Note that * although the support for the S (seq#) flag can be added easily * for GRO, this is problematic for GSO hence can not be enabled * here because a GRO pkt may end up in the forwarding path, thus * requiring GSO support to break it up correctly. */ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) goto out; /* We can only support GRE_CSUM if we can track the location of * the GRE header. In the case of FOU/GUE we cannot because the * outer UDP header displaces the GRE header leaving us in a state * of limbo. */ if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou) goto out; type = greh->protocol; ptype = gro_find_receive_by_type(type); if (!ptype) goto out; grehlen = GRE_HEADER_SECTION; if (greh->flags & GRE_KEY) grehlen += GRE_HEADER_SECTION; if (greh->flags & GRE_CSUM) grehlen += GRE_HEADER_SECTION; hlen = off + grehlen; if (!skb_gro_may_pull(skb, hlen)) { greh = skb_gro_header_slow(skb, hlen, off); if (unlikely(!greh)) goto out; } /* Don't bother verifying checksum if we're going to flush anyway. */ if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) { if (skb_gro_checksum_simple_validate(skb)) goto out; skb_gro_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) continue; /* The following checks are needed to ensure only pkts * from the same tunnel are considered for aggregation. * The criteria for "the same tunnel" includes: * 1) same version (we only support version 0 here) * 2) same protocol (we only support ETH_P_IP for now) * 3) same set of flags * 4) same key if the key field is present. */ greh2 = (struct gre_base_hdr *)(p->data + off); if (greh2->flags != greh->flags || greh2->protocol != greh->protocol) { NAPI_GRO_CB(p)->same_flow = 0; continue; } if (greh->flags & GRE_KEY) { /* compare keys */ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } } skb_gro_pull(skb, grehlen); /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ skb_gro_postpull_rcsum(skb, greh, grehlen); pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out: skb_gro_flush_final(skb, pp, flush); return pp; } static int gre_gro_complete(struct sk_buff *skb, int nhoff) { struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); struct packet_offload *ptype; unsigned int grehlen = sizeof(*greh); int err = -ENOENT; __be16 type; skb->encapsulation = 1; skb_shinfo(skb)->gso_type = SKB_GSO_GRE; type = greh->protocol; if (greh->flags & GRE_KEY) grehlen += GRE_HEADER_SECTION; if (greh->flags & GRE_CSUM) grehlen += GRE_HEADER_SECTION; ptype = gro_find_complete_by_type(type); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); skb_set_inner_mac_header(skb, nhoff + grehlen); return err; } static const struct net_offload gre_offload = { .callbacks = { .gso_segment = gre_gso_segment, .gro_receive = gre_gro_receive, .gro_complete = gre_gro_complete, }, }; static int __init gre_offload_init(void) { int err; err = inet_add_offload(&gre_offload, IPPROTO_GRE); #if IS_ENABLED(CONFIG_IPV6) if (err) return err; err = inet6_add_offload(&gre_offload, IPPROTO_GRE); if (err) inet_del_offload(&gre_offload, IPPROTO_GRE); #endif return err; } device_initcall(gre_offload_init);
33 34 7 7 5 1 7 2 2 2 1 5 7 6 6 2 18 18 8 1 23 21 1 3 3 1 13 18 18 13 9 1 13 7 18 17 18 18 18 18 18 18 18 18 18 6 11 18 18 18 18 17 18 18 18 18 18 18 18 2 18 2 2 2 2 2 2 27 1 19 8 12 2 18 17 18 15 2 2 2 3 9 7 7 5 5 4 1 4 1 1 2 7 27 1 27 1 1 1 18 18 10 13 2 8 4 5 4 5 5 4 1 1 2 4 4 2 7 7 2 5 2 7 7 7 7 2 5 4 6 6 7 7 1 7 7 7 7 16 16 16 18 18 2 18 18 16 2 1 2 17 17 1 16 1 16 15 16 2 14 16 9 7 7 2 5 7 7 15 15 15 2 2 2 2 10 10 10 10 10 20 3 9 10 3 3 7 6 6 7 7 7 5 2 7 7 7 3 3 3 3 3 2 2 2 1 28 1 1 28 15 28 31 43 15 28 2 43 2 42 2 15 28 7 21 28 43 43 43 42 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 /* * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved. * Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved. * Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved. * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/bug.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/splice.h> #include <crypto/aead.h> #include <net/strparser.h> #include <net/tls.h> #include <trace/events/sock.h> #include "tls.h" struct tls_decrypt_arg { struct_group(inargs, bool zc; bool async; bool async_done; u8 tail; ); struct sk_buff *skb; }; struct tls_decrypt_ctx { struct sock *sk; u8 iv[TLS_MAX_IV_SIZE]; u8 aad[TLS_MAX_AAD_SIZE]; u8 tail; bool free_sgout; struct scatterlist sg[]; }; noinline void tls_err_abort(struct sock *sk, int err) { WARN_ON_ONCE(err >= 0); /* sk->sk_err should contain a positive error code. */ WRITE_ONCE(sk->sk_err, -err); /* Paired with smp_rmb() in tcp_poll() */ smp_wmb(); sk_error_report(sk); } static int __skb_nsg(struct sk_buff *skb, int offset, int len, unsigned int recursion_level) { int start = skb_headlen(skb); int i, chunk = start - offset; struct sk_buff *frag_iter; int elt = 0; if (unlikely(recursion_level >= 24)) return -EMSGSIZE; if (chunk > 0) { if (chunk > len) chunk = len; elt++; len -= chunk; if (len == 0) return elt; offset += chunk; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; WARN_ON(start > offset + len); end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); chunk = end - offset; if (chunk > 0) { if (chunk > len) chunk = len; elt++; len -= chunk; if (len == 0) return elt; offset += chunk; } start = end; } if (unlikely(skb_has_frag_list(skb))) { skb_walk_frags(skb, frag_iter) { int end, ret; WARN_ON(start > offset + len); end = start + frag_iter->len; chunk = end - offset; if (chunk > 0) { if (chunk > len) chunk = len; ret = __skb_nsg(frag_iter, offset - start, chunk, recursion_level + 1); if (unlikely(ret < 0)) return ret; elt += ret; len -= chunk; if (len == 0) return elt; offset += chunk; } start = end; } } BUG_ON(len); return elt; } /* Return the number of scatterlist elements required to completely map the * skb, or -EMSGSIZE if the recursion depth is exceeded. */ static int skb_nsg(struct sk_buff *skb, int offset, int len) { return __skb_nsg(skb, offset, len, 0); } static int tls_padding_length(struct tls_prot_info *prot, struct sk_buff *skb, struct tls_decrypt_arg *darg) { struct strp_msg *rxm = strp_msg(skb); struct tls_msg *tlm = tls_msg(skb); int sub = 0; /* Determine zero-padding length */ if (prot->version == TLS_1_3_VERSION) { int offset = rxm->full_len - TLS_TAG_SIZE - 1; char content_type = darg->zc ? darg->tail : 0; int err; while (content_type == 0) { if (offset < prot->prepend_size) return -EBADMSG; err = skb_copy_bits(skb, rxm->offset + offset, &content_type, 1); if (err) return err; if (content_type) break; sub++; offset--; } tlm->control = content_type; } return sub; } static void tls_decrypt_done(void *data, int err) { struct aead_request *aead_req = data; struct crypto_aead *aead = crypto_aead_reqtfm(aead_req); struct scatterlist *sgout = aead_req->dst; struct tls_sw_context_rx *ctx; struct tls_decrypt_ctx *dctx; struct tls_context *tls_ctx; struct scatterlist *sg; unsigned int pages; struct sock *sk; int aead_size; /* If requests get too backlogged crypto API returns -EBUSY and calls * ->complete(-EINPROGRESS) immediately followed by ->complete(0) * to make waiting for backlog to flush with crypto_wait_req() easier. * First wait converts -EBUSY -> -EINPROGRESS, and the second one * -EINPROGRESS -> 0. * We have a single struct crypto_async_request per direction, this * scheme doesn't help us, so just ignore the first ->complete(). */ if (err == -EINPROGRESS) return; aead_size = sizeof(*aead_req) + crypto_aead_reqsize(aead); aead_size = ALIGN(aead_size, __alignof__(*dctx)); dctx = (void *)((u8 *)aead_req + aead_size); sk = dctx->sk; tls_ctx = tls_get_ctx(sk); ctx = tls_sw_ctx_rx(tls_ctx); /* Propagate if there was an err */ if (err) { if (err == -EBADMSG) TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); ctx->async_wait.err = err; tls_err_abort(sk, err); } /* Free the destination pages if skb was not decrypted inplace */ if (dctx->free_sgout) { /* Skip the first S/G entry as it points to AAD */ for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) { if (!sg) break; put_page(sg_page(sg)); } } kfree(aead_req); if (atomic_dec_and_test(&ctx->decrypt_pending)) complete(&ctx->async_wait.completion); } static int tls_decrypt_async_wait(struct tls_sw_context_rx *ctx) { if (!atomic_dec_and_test(&ctx->decrypt_pending)) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); atomic_inc(&ctx->decrypt_pending); return ctx->async_wait.err; } static int tls_do_decryption(struct sock *sk, struct scatterlist *sgin, struct scatterlist *sgout, char *iv_recv, size_t data_len, struct aead_request *aead_req, struct tls_decrypt_arg *darg) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); int ret; aead_request_set_tfm(aead_req, ctx->aead_recv); aead_request_set_ad(aead_req, prot->aad_size); aead_request_set_crypt(aead_req, sgin, sgout, data_len + prot->tag_size, (u8 *)iv_recv); if (darg->async) { aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, tls_decrypt_done, aead_req); DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->decrypt_pending) < 1); atomic_inc(&ctx->decrypt_pending); } else { DECLARE_CRYPTO_WAIT(wait); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &wait); ret = crypto_aead_decrypt(aead_req); if (ret == -EINPROGRESS || ret == -EBUSY) ret = crypto_wait_req(ret, &wait); return ret; } ret = crypto_aead_decrypt(aead_req); if (ret == -EINPROGRESS) return 0; if (ret == -EBUSY) { ret = tls_decrypt_async_wait(ctx); darg->async_done = true; /* all completions have run, we're not doing async anymore */ darg->async = false; return ret; } atomic_dec(&ctx->decrypt_pending); darg->async = false; return ret; } static void tls_trim_both_msgs(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; sk_msg_trim(sk, &rec->msg_plaintext, target_size); if (target_size > 0) target_size += prot->overhead_size; sk_msg_trim(sk, &rec->msg_encrypted, target_size); } static int tls_alloc_encrypted_msg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; struct sk_msg *msg_en = &rec->msg_encrypted; return sk_msg_alloc(sk, msg_en, len, 0); } static int tls_clone_plaintext_msg(struct sock *sk, int required) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; struct sk_msg *msg_pl = &rec->msg_plaintext; struct sk_msg *msg_en = &rec->msg_encrypted; int skip, len; /* We add page references worth len bytes from encrypted sg * at the end of plaintext sg. It is guaranteed that msg_en * has enough required room (ensured by caller). */ len = required - msg_pl->sg.size; /* Skip initial bytes in msg_en's data to be able to use * same offset of both plain and encrypted data. */ skip = prot->prepend_size + msg_pl->sg.size; return sk_msg_clone(sk, msg_pl, msg_en, skip, len); } static struct tls_rec *tls_get_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int mem_size; mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); rec = kzalloc(mem_size, sk->sk_allocation); if (!rec) return NULL; msg_pl = &rec->msg_plaintext; msg_en = &rec->msg_encrypted; sk_msg_init(msg_pl); sk_msg_init(msg_en); sg_init_table(rec->sg_aead_in, 2); sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, prot->aad_size); sg_unmark_end(&rec->sg_aead_in[1]); sg_init_table(rec->sg_aead_out, 2); sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, prot->aad_size); sg_unmark_end(&rec->sg_aead_out[1]); rec->sk = sk; return rec; } static void tls_free_rec(struct sock *sk, struct tls_rec *rec) { sk_msg_free(sk, &rec->msg_encrypted); sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } static void tls_free_open_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; if (rec) { tls_free_rec(sk, rec); ctx->open_rec = NULL; } } int tls_tx_records(struct sock *sk, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec, *tmp; struct sk_msg *msg_en; int tx_flags, rc = 0; if (tls_is_partially_sent_record(tls_ctx)) { rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); if (flags == -1) tx_flags = rec->tx_flags; else tx_flags = flags; rc = tls_push_partial_record(sk, tls_ctx, tx_flags); if (rc) goto tx_err; /* Full record has been transmitted. * Remove the head of tx_list */ list_del(&rec->list); sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } /* Tx all ready records */ list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { if (READ_ONCE(rec->tx_ready)) { if (flags == -1) tx_flags = rec->tx_flags; else tx_flags = flags; msg_en = &rec->msg_encrypted; rc = tls_push_sg(sk, tls_ctx, &msg_en->sg.data[msg_en->sg.curr], 0, tx_flags); if (rc) goto tx_err; list_del(&rec->list); sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } else { break; } } tx_err: if (rc < 0 && rc != -EAGAIN) tls_err_abort(sk, rc); return rc; } static void tls_encrypt_done(void *data, int err) { struct tls_sw_context_tx *ctx; struct tls_context *tls_ctx; struct tls_prot_info *prot; struct tls_rec *rec = data; struct scatterlist *sge; struct sk_msg *msg_en; struct sock *sk; if (err == -EINPROGRESS) /* see the comment in tls_decrypt_done() */ return; msg_en = &rec->msg_encrypted; sk = rec->sk; tls_ctx = tls_get_ctx(sk); prot = &tls_ctx->prot_info; ctx = tls_sw_ctx_tx(tls_ctx); sge = sk_msg_elem(msg_en, msg_en->sg.curr); sge->offset -= prot->prepend_size; sge->length += prot->prepend_size; /* Check if error is previously set on socket */ if (err || sk->sk_err) { rec = NULL; /* If err is already set on socket, return the same code */ if (sk->sk_err) { ctx->async_wait.err = -sk->sk_err; } else { ctx->async_wait.err = err; tls_err_abort(sk, err); } } if (rec) { struct tls_rec *first_rec; /* Mark the record as ready for transmission */ smp_store_mb(rec->tx_ready, true); /* If received record is at head of tx_list, schedule tx */ first_rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); if (rec == first_rec) { /* Schedule the transmission */ if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) schedule_delayed_work(&ctx->tx_work.work, 1); } } if (atomic_dec_and_test(&ctx->encrypt_pending)) complete(&ctx->async_wait.completion); } static int tls_encrypt_async_wait(struct tls_sw_context_tx *ctx) { if (!atomic_dec_and_test(&ctx->encrypt_pending)) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); atomic_inc(&ctx->encrypt_pending); return ctx->async_wait.err; } static int tls_do_encryption(struct sock *sk, struct tls_context *tls_ctx, struct tls_sw_context_tx *ctx, struct aead_request *aead_req, size_t data_len, u32 start) { struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_rec *rec = ctx->open_rec; struct sk_msg *msg_en = &rec->msg_encrypted; struct scatterlist *sge = sk_msg_elem(msg_en, start); int rc, iv_offset = 0; /* For CCM based ciphers, first byte of IV is a constant */ switch (prot->cipher_type) { case TLS_CIPHER_AES_CCM_128: rec->iv_data[0] = TLS_AES_CCM_IV_B0_BYTE; iv_offset = 1; break; case TLS_CIPHER_SM4_CCM: rec->iv_data[0] = TLS_SM4_CCM_IV_B0_BYTE; iv_offset = 1; break; } memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, prot->iv_size + prot->salt_size); tls_xor_iv_with_seq(prot, rec->iv_data + iv_offset, tls_ctx->tx.rec_seq); sge->offset += prot->prepend_size; sge->length -= prot->prepend_size; msg_en->sg.curr = start; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, prot->aad_size); aead_request_set_crypt(aead_req, rec->sg_aead_in, rec->sg_aead_out, data_len, rec->iv_data); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, tls_encrypt_done, rec); /* Add the record in tx_list */ list_add_tail((struct list_head *)&rec->list, &ctx->tx_list); DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->encrypt_pending) < 1); atomic_inc(&ctx->encrypt_pending); rc = crypto_aead_encrypt(aead_req); if (rc == -EBUSY) { rc = tls_encrypt_async_wait(ctx); rc = rc ?: -EINPROGRESS; } if (!rc || rc != -EINPROGRESS) { atomic_dec(&ctx->encrypt_pending); sge->offset -= prot->prepend_size; sge->length += prot->prepend_size; } if (!rc) { WRITE_ONCE(rec->tx_ready, true); } else if (rc != -EINPROGRESS) { list_del(&rec->list); return rc; } /* Unhook the record from context if encryption is not failure */ ctx->open_rec = NULL; tls_advance_record_sn(sk, prot, &tls_ctx->tx); return rc; } static int tls_split_open_record(struct sock *sk, struct tls_rec *from, struct tls_rec **to, struct sk_msg *msg_opl, struct sk_msg *msg_oen, u32 split_point, u32 tx_overhead_size, u32 *orig_end) { u32 i, j, bytes = 0, apply = msg_opl->apply_bytes; struct scatterlist *sge, *osge, *nsge; u32 orig_size = msg_opl->sg.size; struct scatterlist tmp = { }; struct sk_msg *msg_npl; struct tls_rec *new; int ret; new = tls_get_rec(sk); if (!new) return -ENOMEM; ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size + tx_overhead_size, 0); if (ret < 0) { tls_free_rec(sk, new); return ret; } *orig_end = msg_opl->sg.end; i = msg_opl->sg.start; sge = sk_msg_elem(msg_opl, i); while (apply && sge->length) { if (sge->length > apply) { u32 len = sge->length - apply; get_page(sg_page(sge)); sg_set_page(&tmp, sg_page(sge), len, sge->offset + apply); sge->length = apply; bytes += apply; apply = 0; } else { apply -= sge->length; bytes += sge->length; } sk_msg_iter_var_next(i); if (i == msg_opl->sg.end) break; sge = sk_msg_elem(msg_opl, i); } msg_opl->sg.end = i; msg_opl->sg.curr = i; msg_opl->sg.copybreak = 0; msg_opl->apply_bytes = 0; msg_opl->sg.size = bytes; msg_npl = &new->msg_plaintext; msg_npl->apply_bytes = apply; msg_npl->sg.size = orig_size - bytes; j = msg_npl->sg.start; nsge = sk_msg_elem(msg_npl, j); if (tmp.length) { memcpy(nsge, &tmp, sizeof(*nsge)); sk_msg_iter_var_next(j); nsge = sk_msg_elem(msg_npl, j); } osge = sk_msg_elem(msg_opl, i); while (osge->length) { memcpy(nsge, osge, sizeof(*nsge)); sg_unmark_end(nsge); sk_msg_iter_var_next(i); sk_msg_iter_var_next(j); if (i == *orig_end) break; osge = sk_msg_elem(msg_opl, i); nsge = sk_msg_elem(msg_npl, j); } msg_npl->sg.end = j; msg_npl->sg.curr = j; msg_npl->sg.copybreak = 0; *to = new; return 0; } static void tls_merge_open_record(struct sock *sk, struct tls_rec *to, struct tls_rec *from, u32 orig_end) { struct sk_msg *msg_npl = &from->msg_plaintext; struct sk_msg *msg_opl = &to->msg_plaintext; struct scatterlist *osge, *nsge; u32 i, j; i = msg_opl->sg.end; sk_msg_iter_var_prev(i); j = msg_npl->sg.start; osge = sk_msg_elem(msg_opl, i); nsge = sk_msg_elem(msg_npl, j); if (sg_page(osge) == sg_page(nsge) && osge->offset + osge->length == nsge->offset) { osge->length += nsge->length; put_page(sg_page(nsge)); } msg_opl->sg.end = orig_end; msg_opl->sg.curr = orig_end; msg_opl->sg.copybreak = 0; msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size; msg_opl->sg.size += msg_npl->sg.size; sk_msg_free(sk, &to->msg_encrypted); sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted); kfree(from); } static int tls_push_record(struct sock *sk, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec, *tmp = NULL; u32 i, split_point, orig_end; struct sk_msg *msg_pl, *msg_en; struct aead_request *req; bool split; int rc; if (!rec) return 0; msg_pl = &rec->msg_plaintext; msg_en = &rec->msg_encrypted; split_point = msg_pl->apply_bytes; split = split_point && split_point < msg_pl->sg.size; if (unlikely((!split && msg_pl->sg.size + prot->overhead_size > msg_en->sg.size) || (split && split_point + prot->overhead_size > msg_en->sg.size))) { split = true; split_point = msg_en->sg.size; } if (split) { rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en, split_point, prot->overhead_size, &orig_end); if (rc < 0) return rc; /* This can happen if above tls_split_open_record allocates * a single large encryption buffer instead of two smaller * ones. In this case adjust pointers and continue without * split. */ if (!msg_pl->sg.size) { tls_merge_open_record(sk, rec, tmp, orig_end); msg_pl = &rec->msg_plaintext; msg_en = &rec->msg_encrypted; split = false; } sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size); } rec->tx_flags = flags; req = &rec->aead_req; i = msg_pl->sg.end; sk_msg_iter_var_prev(i); rec->content_type = record_type; if (prot->version == TLS_1_3_VERSION) { /* Add content type to end of message. No padding added */ sg_set_buf(&rec->sg_content_type, &rec->content_type, 1); sg_mark_end(&rec->sg_content_type); sg_chain(msg_pl->sg.data, msg_pl->sg.end + 1, &rec->sg_content_type); } else { sg_mark_end(sk_msg_elem(msg_pl, i)); } if (msg_pl->sg.end < msg_pl->sg.start) { sg_chain(&msg_pl->sg.data[msg_pl->sg.start], MAX_SKB_FRAGS - msg_pl->sg.start + 1, msg_pl->sg.data); } i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); i = msg_en->sg.end; sk_msg_iter_var_prev(i); sg_mark_end(sk_msg_elem(msg_en, i)); i = msg_en->sg.start; sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]); tls_make_aad(rec->aad_space, msg_pl->sg.size + prot->tail_size, tls_ctx->tx.rec_seq, record_type, prot); tls_fill_prepend(tls_ctx, page_address(sg_page(&msg_en->sg.data[i])) + msg_en->sg.data[i].offset, msg_pl->sg.size + prot->tail_size, record_type); tls_ctx->pending_open_record_frags = false; rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size + prot->tail_size, i); if (rc < 0) { if (rc != -EINPROGRESS) { tls_err_abort(sk, -EBADMSG); if (split) { tls_ctx->pending_open_record_frags = true; tls_merge_open_record(sk, rec, tmp, orig_end); } } ctx->async_capable = 1; return rc; } else if (split) { msg_pl = &tmp->msg_plaintext; msg_en = &tmp->msg_encrypted; sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size); tls_ctx->pending_open_record_frags = true; ctx->open_rec = tmp; } return tls_tx_records(sk, flags); } static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, bool full_record, u8 record_type, ssize_t *copied, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct sk_msg msg_redir = { }; struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; bool enospc, policy, redir_ingress; int err = 0, send; u32 delta = 0; policy = !(flags & MSG_SENDPAGE_NOPOLICY); psock = sk_psock_get(sk); if (!psock || !policy) { err = tls_push_record(sk, flags, record_type); if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; } if (psock) sk_psock_put(sk, psock); return err; } more_data: enospc = sk_msg_full(msg); if (psock->eval == __SK_NONE) { delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); delta -= msg->sg.size; } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { err = -ENOSPC; goto out_err; } msg->cork_bytes = 0; send = msg->sg.size; if (msg->apply_bytes && msg->apply_bytes < send) send = msg->apply_bytes; switch (psock->eval) { case __SK_PASS: err = tls_push_record(sk, flags, record_type); if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; goto out_err; } break; case __SK_REDIRECT: redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; memcpy(&msg_redir, msg, sizeof(*msg)); if (msg->apply_bytes < send) msg->apply_bytes = 0; else msg->apply_bytes -= send; sk_msg_return_zero(sk, msg, send); msg->sg.size -= send; release_sock(sk); err = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, &msg_redir, send, flags); lock_sock(sk); if (err < 0) { *copied -= sk_msg_free_nocharge(sk, &msg_redir); msg->sg.size = 0; } if (msg->sg.size == 0) tls_free_open_rec(sk); break; case __SK_DROP: default: sk_msg_free_partial(sk, msg, send); if (msg->apply_bytes < send) msg->apply_bytes = 0; else msg->apply_bytes -= send; if (msg->sg.size == 0) tls_free_open_rec(sk); *copied -= (send + delta); err = -EACCES; } if (likely(!err)) { bool reset_eval = !ctx->open_rec; rec = ctx->open_rec; if (rec) { msg = &rec->msg_plaintext; if (!msg->apply_bytes) reset_eval = true; } if (reset_eval) { psock->eval = __SK_NONE; if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } } if (rec) goto more_data; } out_err: sk_psock_put(sk, psock); return err; } static int tls_sw_push_pending_record(struct sock *sk, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; struct sk_msg *msg_pl; size_t copied; if (!rec) return 0; msg_pl = &rec->msg_plaintext; copied = msg_pl->sg.size; if (!copied) return 0; return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA, &copied, flags); } static int tls_sw_sendmsg_splice(struct sock *sk, struct msghdr *msg, struct sk_msg *msg_pl, size_t try_to_copy, ssize_t *copied) { struct page *page = NULL, **pages = &page; do { ssize_t part; size_t off; part = iov_iter_extract_pages(&msg->msg_iter, &pages, try_to_copy, 1, 0, &off); if (part <= 0) return part ?: -EIO; if (WARN_ON_ONCE(!sendpage_ok(page))) { iov_iter_revert(&msg->msg_iter, part); return -EIO; } sk_msg_page_add(msg_pl, page, part, off); msg_pl->sg.copybreak = 0; msg_pl->sg.curr = msg_pl->sg.end; sk_mem_charge(sk, part); *copied += part; try_to_copy -= part; } while (try_to_copy && !sk_msg_full(msg_pl)); return 0; } static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); bool async_capable = ctx->async_capable; unsigned char record_type = TLS_RECORD_TYPE_DATA; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy; ssize_t copied = 0; struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int required_size; int num_async = 0; bool full_record; int record_room; int num_zc = 0; int orig_size; int ret = 0; if (!eor && (msg->msg_flags & MSG_EOR)) return -EINVAL; if (unlikely(msg->msg_controllen)) { ret = tls_process_cmsg(sk, msg, &record_type); if (ret) { if (ret == -EINPROGRESS) num_async++; else if (ret != -EAGAIN) goto send_end; } } while (msg_data_left(msg)) { if (sk->sk_err) { ret = -sk->sk_err; goto send_end; } if (ctx->open_rec) rec = ctx->open_rec; else rec = ctx->open_rec = tls_get_rec(sk); if (!rec) { ret = -ENOMEM; goto send_end; } msg_pl = &rec->msg_plaintext; msg_en = &rec->msg_encrypted; orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; } required_size = msg_pl->sg.size + try_to_copy + prot->overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_encrypted: ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; /* Adjust try_to_copy according to the amount that was * actually allocated. The difference is due * to max sg elements limit */ try_to_copy -= required_size - msg_en->sg.size; full_record = true; } if (try_to_copy && (msg->msg_flags & MSG_SPLICE_PAGES)) { ret = tls_sw_sendmsg_splice(sk, msg, msg_pl, try_to_copy, &copied); if (ret < 0) goto send_end; tls_ctx->pending_open_record_frags = true; if (sk_msg_full(msg_pl)) full_record = true; if (full_record || eor) goto copied; continue; } if (!is_kvec && (full_record || eor) && !async_capable) { u32 first = msg_pl->sg.end; ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, msg_pl, try_to_copy); if (ret) goto fallback_to_reg_send; num_zc++; copied += try_to_copy; sk_msg_sg_copy_set(msg_pl, first); ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, record_type, &copied, msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; else if (ret == -ENOMEM) goto wait_for_memory; else if (ctx->open_rec && ret == -ENOSPC) goto rollback_iter; else if (ret != -EAGAIN) goto send_end; } continue; rollback_iter: copied -= try_to_copy; sk_msg_sg_copy_clear(msg_pl, first); iov_iter_revert(&msg->msg_iter, msg_pl->sg.size - orig_size); fallback_to_reg_send: sk_msg_trim(sk, msg_pl, orig_size); } required_size = msg_pl->sg.size + try_to_copy; ret = tls_clone_plaintext_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto send_end; /* Adjust try_to_copy according to the amount that was * actually allocated. The difference is due * to max sg elements limit */ try_to_copy -= required_size - msg_pl->sg.size; full_record = true; sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size); } if (try_to_copy) { ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl, try_to_copy); if (ret < 0) goto trim_sgl; } /* Open records defined only if successfully copied, otherwise * we would trim the sg but not reset the open record frags. */ tls_ctx->pending_open_record_frags = true; copied += try_to_copy; copied: if (full_record || eor) { ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, record_type, &copied, msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; else if (ret == -ENOMEM) goto wait_for_memory; else if (ret != -EAGAIN) { if (ret == -ENOSPC) ret = 0; goto send_end; } } } continue; wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { trim_sgl: if (ctx->open_rec) tls_trim_both_msgs(sk, orig_size); goto send_end; } if (ctx->open_rec && msg_en->sg.size < required_size) goto alloc_encrypted; } if (!num_async) { goto send_end; } else if (num_zc || eor) { int err; /* Wait for pending encryptions to get completed */ err = tls_encrypt_async_wait(ctx); if (err) { ret = err; copied = 0; } } /* Transmit if any encryptions have completed */ if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { cancel_delayed_work(&ctx->tx_work.work); tls_tx_records(sk, msg->msg_flags); } send_end: ret = sk_stream_error(sk, msg->msg_flags, ret); return copied > 0 ? copied : ret; } int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tls_context *tls_ctx = tls_get_ctx(sk); int ret; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | MSG_EOR | MSG_SENDPAGE_NOPOLICY)) return -EOPNOTSUPP; ret = mutex_lock_interruptible(&tls_ctx->tx_lock); if (ret) return ret; lock_sock(sk); ret = tls_sw_sendmsg_locked(sk, msg, size); release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); return ret; } /* * Handle unexpected EOF during splice without SPLICE_F_MORE set. */ void tls_sw_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec; struct sk_msg *msg_pl; ssize_t copied = 0; bool retrying = false; int ret = 0; if (!ctx->open_rec) return; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); retry: /* same checks as in tls_sw_push_pending_record() */ rec = ctx->open_rec; if (!rec) goto unlock; msg_pl = &rec->msg_plaintext; if (msg_pl->sg.size == 0) goto unlock; /* Check the BPF advisor and perform transmission. */ ret = bpf_exec_tx_verdict(msg_pl, sk, false, TLS_RECORD_TYPE_DATA, &copied, 0); switch (ret) { case 0: case -EAGAIN: if (retrying) goto unlock; retrying = true; goto retry; case -EINPROGRESS: break; default: goto unlock; } /* Wait for pending encryptions to get completed */ if (tls_encrypt_async_wait(ctx)) goto unlock; /* Transmit if any encryptions have completed */ if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { cancel_delayed_work(&ctx->tx_work.work); tls_tx_records(sk, 0); } unlock: release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); } static int tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, bool released) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; long timeo; /* a rekey is pending, let userspace deal with it */ if (unlikely(ctx->key_update_pending)) return -EKEYEXPIRED; timeo = sock_rcvtimeo(sk, nonblock); while (!tls_strp_msg_ready(ctx)) { if (!sk_psock_queue_empty(psock)) return 0; if (sk->sk_err) return sock_error(sk); if (ret < 0) return ret; if (!skb_queue_empty(&sk->sk_receive_queue)) { tls_strp_check_rcv(&ctx->strp); if (tls_strp_msg_ready(ctx)) break; } if (sk->sk_shutdown & RCV_SHUTDOWN) return 0; if (sock_flag(sk, SOCK_DONE)) return 0; if (!timeo) return -EAGAIN; released = true; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, tls_strp_msg_ready(ctx) || !sk_psock_queue_empty(psock), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); /* Handle signals */ if (signal_pending(current)) return sock_intr_errno(timeo); } tls_strp_msg_load(&ctx->strp, released); return 1; } static int tls_setup_from_iter(struct iov_iter *from, int length, int *pages_used, struct scatterlist *to, int to_max_pages) { int rc = 0, i = 0, num_elem = *pages_used, maxpages; struct page *pages[MAX_SKB_FRAGS]; unsigned int size = 0; ssize_t copied, use; size_t offset; while (length > 0) { i = 0; maxpages = to_max_pages - num_elem; if (maxpages == 0) { rc = -EFAULT; goto out; } copied = iov_iter_get_pages2(from, pages, length, maxpages, &offset); if (copied <= 0) { rc = -EFAULT; goto out; } length -= copied; size += copied; while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); sg_set_page(&to[num_elem], pages[i], use, offset); sg_unmark_end(&to[num_elem]); /* We do not uncharge memory from this API */ offset = 0; copied -= use; i++; num_elem++; } } /* Mark the end in the last sg entry if newly added */ if (num_elem > *pages_used) sg_mark_end(&to[num_elem - 1]); out: if (rc) iov_iter_revert(from, size); *pages_used = num_elem; return rc; } static struct sk_buff * tls_alloc_clrtxt_skb(struct sock *sk, struct sk_buff *skb, unsigned int full_len) { struct strp_msg *clr_rxm; struct sk_buff *clr_skb; int err; clr_skb = alloc_skb_with_frags(0, full_len, TLS_PAGE_ORDER, &err, sk->sk_allocation); if (!clr_skb) return NULL; skb_copy_header(clr_skb, skb); clr_skb->len = full_len; clr_skb->data_len = full_len; clr_rxm = strp_msg(clr_skb); clr_rxm->offset = 0; return clr_skb; } /* Decrypt handlers * * tls_decrypt_sw() and tls_decrypt_device() are decrypt handlers. * They must transform the darg in/out argument are as follows: * | Input | Output * ------------------------------------------------------------------- * zc | Zero-copy decrypt allowed | Zero-copy performed * async | Async decrypt allowed | Async crypto used / in progress * skb | * | Output skb * * If ZC decryption was performed darg.skb will point to the input skb. */ /* This function decrypts the input skb into either out_iov or in out_sg * or in skb buffers itself. The input parameter 'darg->zc' indicates if * zero-copy mode needs to be tried or not. With zero-copy mode, either * out_iov or out_sg must be non-NULL. In case both out_iov and out_sg are * NULL, then the decryption happens inside skb buffers itself, i.e. * zero-copy gets disabled and 'darg->zc' is updated. */ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov, struct scatterlist *out_sg, struct tls_decrypt_arg *darg) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; int n_sgin, n_sgout, aead_size, err, pages = 0; struct sk_buff *skb = tls_strp_msg(ctx); const struct strp_msg *rxm = strp_msg(skb); const struct tls_msg *tlm = tls_msg(skb); struct aead_request *aead_req; struct scatterlist *sgin = NULL; struct scatterlist *sgout = NULL; const int data_len = rxm->full_len - prot->overhead_size; int tail_pages = !!prot->tail_size; struct tls_decrypt_ctx *dctx; struct sk_buff *clear_skb; int iv_offset = 0; u8 *mem; n_sgin = skb_nsg(skb, rxm->offset + prot->prepend_size, rxm->full_len - prot->prepend_size); if (n_sgin < 1) return n_sgin ?: -EBADMSG; if (darg->zc && (out_iov || out_sg)) { clear_skb = NULL; if (out_iov) n_sgout = 1 + tail_pages + iov_iter_npages_cap(out_iov, INT_MAX, data_len); else n_sgout = sg_nents(out_sg); } else { darg->zc = false; clear_skb = tls_alloc_clrtxt_skb(sk, skb, rxm->full_len); if (!clear_skb) return -ENOMEM; n_sgout = 1 + skb_shinfo(clear_skb)->nr_frags; } /* Increment to accommodate AAD */ n_sgin = n_sgin + 1; /* Allocate a single block of memory which contains * aead_req || tls_decrypt_ctx. * Both structs are variable length. */ aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv); aead_size = ALIGN(aead_size, __alignof__(*dctx)); mem = kmalloc(aead_size + struct_size(dctx, sg, size_add(n_sgin, n_sgout)), sk->sk_allocation); if (!mem) { err = -ENOMEM; goto exit_free_skb; } /* Segment the allocated memory */ aead_req = (struct aead_request *)mem; dctx = (struct tls_decrypt_ctx *)(mem + aead_size); dctx->sk = sk; sgin = &dctx->sg[0]; sgout = &dctx->sg[n_sgin]; /* For CCM based ciphers, first byte of nonce+iv is a constant */ switch (prot->cipher_type) { case TLS_CIPHER_AES_CCM_128: dctx->iv[0] = TLS_AES_CCM_IV_B0_BYTE; iv_offset = 1; break; case TLS_CIPHER_SM4_CCM: dctx->iv[0] = TLS_SM4_CCM_IV_B0_BYTE; iv_offset = 1; break; } /* Prepare IV */ if (prot->version == TLS_1_3_VERSION || prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) { memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv, prot->iv_size + prot->salt_size); } else { err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, &dctx->iv[iv_offset] + prot->salt_size, prot->iv_size); if (err < 0) goto exit_free; memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv, prot->salt_size); } tls_xor_iv_with_seq(prot, &dctx->iv[iv_offset], tls_ctx->rx.rec_seq); /* Prepare AAD */ tls_make_aad(dctx->aad, rxm->full_len - prot->overhead_size + prot->tail_size, tls_ctx->rx.rec_seq, tlm->control, prot); /* Prepare sgin */ sg_init_table(sgin, n_sgin); sg_set_buf(&sgin[0], dctx->aad, prot->aad_size); err = skb_to_sgvec(skb, &sgin[1], rxm->offset + prot->prepend_size, rxm->full_len - prot->prepend_size); if (err < 0) goto exit_free; if (clear_skb) { sg_init_table(sgout, n_sgout); sg_set_buf(&sgout[0], dctx->aad, prot->aad_size); err = skb_to_sgvec(clear_skb, &sgout[1], prot->prepend_size, data_len + prot->tail_size); if (err < 0) goto exit_free; } else if (out_iov) { sg_init_table(sgout, n_sgout); sg_set_buf(&sgout[0], dctx->aad, prot->aad_size); err = tls_setup_from_iter(out_iov, data_len, &pages, &sgout[1], (n_sgout - 1 - tail_pages)); if (err < 0) goto exit_free_pages; if (prot->tail_size) { sg_unmark_end(&sgout[pages]); sg_set_buf(&sgout[pages + 1], &dctx->tail, prot->tail_size); sg_mark_end(&sgout[pages + 1]); } } else if (out_sg) { memcpy(sgout, out_sg, n_sgout * sizeof(*sgout)); } dctx->free_sgout = !!pages; /* Prepare and submit AEAD request */ err = tls_do_decryption(sk, sgin, sgout, dctx->iv, data_len + prot->tail_size, aead_req, darg); if (err) { if (darg->async_done) goto exit_free_skb; goto exit_free_pages; } darg->skb = clear_skb ?: tls_strp_msg(ctx); clear_skb = NULL; if (unlikely(darg->async)) { err = tls_strp_msg_hold(&ctx->strp, &ctx->async_hold); if (err) __skb_queue_tail(&ctx->async_hold, darg->skb); return err; } if (unlikely(darg->async_done)) return 0; if (prot->tail_size) darg->tail = dctx->tail; exit_free_pages: /* Release the pages in case iov was mapped to pages */ for (; pages > 0; pages--) put_page(sg_page(&sgout[pages])); exit_free: kfree(mem); exit_free_skb: consume_skb(clear_skb); return err; } static int tls_decrypt_sw(struct sock *sk, struct tls_context *tls_ctx, struct msghdr *msg, struct tls_decrypt_arg *darg) { struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; struct strp_msg *rxm; int pad, err; err = tls_decrypt_sg(sk, &msg->msg_iter, NULL, darg); if (err < 0) { if (err == -EBADMSG) TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); return err; } /* keep going even for ->async, the code below is TLS 1.3 */ /* If opportunistic TLS 1.3 ZC failed retry without ZC */ if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION && darg->tail != TLS_RECORD_TYPE_DATA)) { darg->zc = false; if (!darg->tail) TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXNOPADVIOL); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTRETRY); return tls_decrypt_sw(sk, tls_ctx, msg, darg); } pad = tls_padding_length(prot, darg->skb, darg); if (pad < 0) { if (darg->skb != tls_strp_msg(ctx)) consume_skb(darg->skb); return pad; } rxm = strp_msg(darg->skb); rxm->full_len -= pad; return 0; } static int tls_decrypt_device(struct sock *sk, struct msghdr *msg, struct tls_context *tls_ctx, struct tls_decrypt_arg *darg) { struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; struct strp_msg *rxm; int pad, err; if (tls_ctx->rx_conf != TLS_HW) return 0; err = tls_device_decrypted(sk, tls_ctx); if (err <= 0) return err; pad = tls_padding_length(prot, tls_strp_msg(ctx), darg); if (pad < 0) return pad; darg->async = false; darg->skb = tls_strp_msg(ctx); /* ->zc downgrade check, in case TLS 1.3 gets here */ darg->zc &= !(prot->version == TLS_1_3_VERSION && tls_msg(darg->skb)->control != TLS_RECORD_TYPE_DATA); rxm = strp_msg(darg->skb); rxm->full_len -= pad; if (!darg->zc) { /* Non-ZC case needs a real skb */ darg->skb = tls_strp_msg_detach(ctx); if (!darg->skb) return -ENOMEM; } else { unsigned int off, len; /* In ZC case nobody cares about the output skb. * Just copy the data here. Note the skb is not fully trimmed. */ off = rxm->offset + prot->prepend_size; len = rxm->full_len - prot->overhead_size; err = skb_copy_datagram_msg(darg->skb, off, msg, len); if (err) return err; } return 1; } static int tls_check_pending_rekey(struct sock *sk, struct tls_context *ctx, struct sk_buff *skb) { const struct strp_msg *rxm = strp_msg(skb); const struct tls_msg *tlm = tls_msg(skb); char hs_type; int err; if (likely(tlm->control != TLS_RECORD_TYPE_HANDSHAKE)) return 0; if (rxm->full_len < 1) return 0; err = skb_copy_bits(skb, rxm->offset, &hs_type, 1); if (err < 0) { DEBUG_NET_WARN_ON_ONCE(1); return err; } if (hs_type == TLS_HANDSHAKE_KEYUPDATE) { struct tls_sw_context_rx *rx_ctx = ctx->priv_ctx_rx; WRITE_ONCE(rx_ctx->key_update_pending, true); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYRECEIVED); } return 0; } static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, struct tls_decrypt_arg *darg) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct strp_msg *rxm; int err; err = tls_decrypt_device(sk, msg, tls_ctx, darg); if (!err) err = tls_decrypt_sw(sk, tls_ctx, msg, darg); if (err < 0) return err; rxm = strp_msg(darg->skb); rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); return tls_check_pending_rekey(sk, tls_ctx, darg->skb); } int decrypt_skb(struct sock *sk, struct scatterlist *sgout) { struct tls_decrypt_arg darg = { .zc = true, }; return tls_decrypt_sg(sk, NULL, sgout, &darg); } static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm, u8 *control) { int err; if (!*control) { *control = tlm->control; if (!*control) return -EBADMSG; err = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE, sizeof(*control), control); if (*control != TLS_RECORD_TYPE_DATA) { if (err || msg->msg_flags & MSG_CTRUNC) return -EIO; } } else if (*control != tlm->control) { return 0; } return 1; } static void tls_rx_rec_done(struct tls_sw_context_rx *ctx) { tls_strp_msg_done(&ctx->strp); } /* This function traverses the rx_list in tls receive context to copies the * decrypted records into the buffer provided by caller zero copy is not * true. Further, the records are removed from the rx_list if it is not a peek * case and the record has been consumed completely. */ static int process_rx_list(struct tls_sw_context_rx *ctx, struct msghdr *msg, u8 *control, size_t skip, size_t len, bool is_peek, bool *more) { struct sk_buff *skb = skb_peek(&ctx->rx_list); struct tls_msg *tlm; ssize_t copied = 0; int err; while (skip && skb) { struct strp_msg *rxm = strp_msg(skb); tlm = tls_msg(skb); err = tls_record_content_type(msg, tlm, control); if (err <= 0) goto more; if (skip < rxm->full_len) break; skip = skip - rxm->full_len; skb = skb_peek_next(skb, &ctx->rx_list); } while (len && skb) { struct sk_buff *next_skb; struct strp_msg *rxm = strp_msg(skb); int chunk = min_t(unsigned int, rxm->full_len - skip, len); tlm = tls_msg(skb); err = tls_record_content_type(msg, tlm, control); if (err <= 0) goto more; err = skb_copy_datagram_msg(skb, rxm->offset + skip, msg, chunk); if (err < 0) goto more; len = len - chunk; copied = copied + chunk; /* Consume the data from record if it is non-peek case*/ if (!is_peek) { rxm->offset = rxm->offset + chunk; rxm->full_len = rxm->full_len - chunk; /* Return if there is unconsumed data in the record */ if (rxm->full_len - skip) break; } /* The remaining skip-bytes must lie in 1st record in rx_list. * So from the 2nd record, 'skip' should be 0. */ skip = 0; if (msg) msg->msg_flags |= MSG_EOR; next_skb = skb_peek_next(skb, &ctx->rx_list); if (!is_peek) { __skb_unlink(skb, &ctx->rx_list); consume_skb(skb); } skb = next_skb; } err = 0; out: return copied ? : err; more: if (more) *more = true; goto out; } static bool tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot, size_t len_left, size_t decrypted, ssize_t done, size_t *flushed_at) { size_t max_rec; if (len_left <= decrypted) return false; max_rec = prot->overhead_size - prot->tail_size + TLS_MAX_PAYLOAD_SIZE; if (done - *flushed_at < SZ_128K && tcp_inq(sk) > max_rec) return false; *flushed_at = done; return sk_flush_backlog(sk); } static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, bool nonblock) { long timeo; int ret; timeo = sock_rcvtimeo(sk, nonblock); while (unlikely(ctx->reader_present)) { DEFINE_WAIT_FUNC(wait, woken_wake_function); ctx->reader_contended = 1; add_wait_queue(&ctx->wq, &wait); ret = sk_wait_event(sk, &timeo, !READ_ONCE(ctx->reader_present), &wait); remove_wait_queue(&ctx->wq, &wait); if (timeo <= 0) return -EAGAIN; if (signal_pending(current)) return sock_intr_errno(timeo); if (ret < 0) return ret; } WRITE_ONCE(ctx->reader_present, 1); return 0; } static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, bool nonblock) { int err; lock_sock(sk); err = tls_rx_reader_acquire(sk, ctx, nonblock); if (err) release_sock(sk); return err; } static void tls_rx_reader_release(struct sock *sk, struct tls_sw_context_rx *ctx) { if (unlikely(ctx->reader_contended)) { if (wq_has_sleeper(&ctx->wq)) wake_up(&ctx->wq); else ctx->reader_contended = 0; WARN_ON_ONCE(!ctx->reader_present); } WRITE_ONCE(ctx->reader_present, 0); } static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) { tls_rx_reader_release(sk, ctx); release_sock(sk); } int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; ssize_t decrypted = 0, async_copy_bytes = 0; struct sk_psock *psock; unsigned char control = 0; size_t flushed_at = 0; struct strp_msg *rxm; struct tls_msg *tlm; ssize_t copied = 0; ssize_t peeked = 0; bool async = false; int target, err; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; bool rx_more = false; bool released = true; bool bpf_strp_enabled; bool zc_capable; if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); err = tls_rx_reader_lock(sk, ctx, flags & MSG_DONTWAIT); if (err < 0) return err; psock = sk_psock_get(sk); bpf_strp_enabled = sk_psock_strp_enabled(psock); /* If crypto failed the connection is broken */ err = ctx->async_wait.err; if (err) goto end; /* Process pending decrypted records. It must be non-zero-copy */ err = process_rx_list(ctx, msg, &control, 0, len, is_peek, &rx_more); if (err < 0) goto end; copied = err; if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); len = len - copied; zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek && ctx->zc_capable; decrypted = 0; while (len && (decrypted + copied < target || tls_strp_msg_ready(ctx))) { struct tls_decrypt_arg darg; int to_decrypt, chunk; err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT, released); if (err <= 0) { if (psock) { chunk = sk_msg_recvmsg(sk, psock, msg, len, flags); if (chunk > 0) { decrypted += chunk; len -= chunk; continue; } } goto recv_end; } memset(&darg.inargs, 0, sizeof(darg.inargs)); rxm = strp_msg(tls_strp_msg(ctx)); tlm = tls_msg(tls_strp_msg(ctx)); to_decrypt = rxm->full_len - prot->overhead_size; if (zc_capable && to_decrypt <= len && tlm->control == TLS_RECORD_TYPE_DATA) darg.zc = true; /* Do not use async mode if record is non-data */ if (tlm->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled) darg.async = ctx->async_capable; else darg.async = false; err = tls_rx_one_record(sk, msg, &darg); if (err < 0) { tls_err_abort(sk, -EBADMSG); goto recv_end; } async |= darg.async; /* If the type of records being processed is not known yet, * set it to record type just dequeued. If it is already known, * but does not match the record type just dequeued, go to end. * We always get record type here since for tls1.2, record type * is known just after record is dequeued from stream parser. * For tls1.3, we disable async. */ err = tls_record_content_type(msg, tls_msg(darg.skb), &control); if (err <= 0) { DEBUG_NET_WARN_ON_ONCE(darg.zc); tls_rx_rec_done(ctx); put_on_rx_list_err: __skb_queue_tail(&ctx->rx_list, darg.skb); goto recv_end; } /* periodically flush backlog, and feed strparser */ released = tls_read_flush_backlog(sk, prot, len, to_decrypt, decrypted + copied, &flushed_at); /* TLS 1.3 may have updated the length by more than overhead */ rxm = strp_msg(darg.skb); chunk = rxm->full_len; tls_rx_rec_done(ctx); if (!darg.zc) { bool partially_consumed = chunk > len; struct sk_buff *skb = darg.skb; DEBUG_NET_WARN_ON_ONCE(darg.skb == ctx->strp.anchor); if (async) { /* TLS 1.2-only, to_decrypt must be text len */ chunk = min_t(int, to_decrypt, len); async_copy_bytes += chunk; put_on_rx_list: decrypted += chunk; len -= chunk; __skb_queue_tail(&ctx->rx_list, skb); if (unlikely(control != TLS_RECORD_TYPE_DATA)) break; continue; } if (bpf_strp_enabled) { released = true; err = sk_psock_tls_strp_read(psock, skb); if (err != __SK_PASS) { rxm->offset = rxm->offset + rxm->full_len; rxm->full_len = 0; if (err == __SK_DROP) consume_skb(skb); continue; } } if (partially_consumed) chunk = len; err = skb_copy_datagram_msg(skb, rxm->offset, msg, chunk); if (err < 0) goto put_on_rx_list_err; if (is_peek) { peeked += chunk; goto put_on_rx_list; } if (partially_consumed) { rxm->offset += chunk; rxm->full_len -= chunk; goto put_on_rx_list; } consume_skb(skb); } decrypted += chunk; len -= chunk; /* Return full control message to userspace before trying * to parse another message type */ msg->msg_flags |= MSG_EOR; if (control != TLS_RECORD_TYPE_DATA) break; } recv_end: if (async) { int ret; /* Wait for all previously submitted records to be decrypted */ ret = tls_decrypt_async_wait(ctx); __skb_queue_purge(&ctx->async_hold); if (ret) { if (err >= 0 || err == -EINPROGRESS) err = ret; goto end; } /* Drain records from the rx_list & copy if required */ if (is_peek) err = process_rx_list(ctx, msg, &control, copied + peeked, decrypted - peeked, is_peek, NULL); else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek, NULL); /* we could have copied less than we wanted, and possibly nothing */ decrypted += max(err, 0) - async_copy_bytes; } copied += decrypted; end: tls_rx_reader_unlock(sk, ctx); if (psock) sk_psock_put(sk, psock); return copied ? : err; } ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct tls_context *tls_ctx = tls_get_ctx(sock->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = NULL; struct sock *sk = sock->sk; struct tls_msg *tlm; struct sk_buff *skb; ssize_t copied = 0; int chunk; int err; err = tls_rx_reader_lock(sk, ctx, flags & SPLICE_F_NONBLOCK); if (err < 0) return err; if (!skb_queue_empty(&ctx->rx_list)) { skb = __skb_dequeue(&ctx->rx_list); } else { struct tls_decrypt_arg darg; err = tls_rx_rec_wait(sk, NULL, flags & SPLICE_F_NONBLOCK, true); if (err <= 0) goto splice_read_end; memset(&darg.inargs, 0, sizeof(darg.inargs)); err = tls_rx_one_record(sk, NULL, &darg); if (err < 0) { tls_err_abort(sk, -EBADMSG); goto splice_read_end; } tls_rx_rec_done(ctx); skb = darg.skb; } rxm = strp_msg(skb); tlm = tls_msg(skb); /* splice does not support reading control messages */ if (tlm->control != TLS_RECORD_TYPE_DATA) { err = -EINVAL; goto splice_requeue; } chunk = min_t(unsigned int, rxm->full_len, len); copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags); if (copied < 0) goto splice_requeue; if (chunk < rxm->full_len) { rxm->offset += len; rxm->full_len -= len; goto splice_requeue; } consume_skb(skb); splice_read_end: tls_rx_reader_unlock(sk, ctx); return copied ? : err; splice_requeue: __skb_queue_head(&ctx->rx_list, skb); goto splice_read_end; } int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t read_actor) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; struct strp_msg *rxm = NULL; struct sk_buff *skb = NULL; struct sk_psock *psock; size_t flushed_at = 0; bool released = true; struct tls_msg *tlm; ssize_t copied = 0; ssize_t decrypted; int err, used; psock = sk_psock_get(sk); if (psock) { sk_psock_put(sk, psock); return -EINVAL; } err = tls_rx_reader_acquire(sk, ctx, true); if (err < 0) return err; /* If crypto failed the connection is broken */ err = ctx->async_wait.err; if (err) goto read_sock_end; decrypted = 0; do { if (!skb_queue_empty(&ctx->rx_list)) { skb = __skb_dequeue(&ctx->rx_list); rxm = strp_msg(skb); tlm = tls_msg(skb); } else { struct tls_decrypt_arg darg; err = tls_rx_rec_wait(sk, NULL, true, released); if (err <= 0) goto read_sock_end; memset(&darg.inargs, 0, sizeof(darg.inargs)); err = tls_rx_one_record(sk, NULL, &darg); if (err < 0) { tls_err_abort(sk, -EBADMSG); goto read_sock_end; } released = tls_read_flush_backlog(sk, prot, INT_MAX, 0, decrypted, &flushed_at); skb = darg.skb; rxm = strp_msg(skb); tlm = tls_msg(skb); decrypted += rxm->full_len; tls_rx_rec_done(ctx); } /* read_sock does not support reading control messages */ if (tlm->control != TLS_RECORD_TYPE_DATA) { err = -EINVAL; goto read_sock_requeue; } used = read_actor(desc, skb, rxm->offset, rxm->full_len); if (used <= 0) { if (!copied) err = used; goto read_sock_requeue; } copied += used; if (used < rxm->full_len) { rxm->offset += used; rxm->full_len -= used; if (!desc->count) goto read_sock_requeue; } else { consume_skb(skb); if (!desc->count) skb = NULL; } } while (skb); read_sock_end: tls_rx_reader_release(sk, ctx); return copied ? : err; read_sock_requeue: __skb_queue_head(&ctx->rx_list, skb); goto read_sock_end; } bool tls_sw_sock_is_readable(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); bool ingress_empty = true; struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) ingress_empty = list_empty(&psock->ingress_msg); rcu_read_unlock(); return !ingress_empty || tls_strp_msg_ready(ctx) || !skb_queue_empty(&ctx->rx_list); } int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_prot_info *prot = &tls_ctx->prot_info; char header[TLS_HEADER_SIZE + TLS_MAX_IV_SIZE]; size_t cipher_overhead; size_t data_len = 0; int ret; /* Verify that we have a full TLS header, or wait for more data */ if (strp->stm.offset + prot->prepend_size > skb->len) return 0; /* Sanity-check size of on-stack buffer. */ if (WARN_ON(prot->prepend_size > sizeof(header))) { ret = -EINVAL; goto read_failure; } /* Linearize header to local buffer */ ret = skb_copy_bits(skb, strp->stm.offset, header, prot->prepend_size); if (ret < 0) goto read_failure; strp->mark = header[0]; data_len = ((header[4] & 0xFF) | (header[3] << 8)); cipher_overhead = prot->tag_size; if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) cipher_overhead += prot->iv_size; if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead + prot->tail_size) { ret = -EMSGSIZE; goto read_failure; } if (data_len < cipher_overhead) { ret = -EBADMSG; goto read_failure; } /* Note that both TLS1.3 and TLS1.2 use TLS_1_2 version here */ if (header[1] != TLS_1_2_VERSION_MINOR || header[2] != TLS_1_2_VERSION_MAJOR) { ret = -EINVAL; goto read_failure; } tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE, TCP_SKB_CB(skb)->seq + strp->stm.offset); return data_len + TLS_HEADER_SIZE; read_failure: tls_err_abort(strp->sk, ret); return ret; } void tls_rx_msg_ready(struct tls_strparser *strp) { struct tls_sw_context_rx *ctx; ctx = container_of(strp, struct tls_sw_context_rx, strp); ctx->saved_data_ready(strp->sk); } static void tls_data_ready(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_psock *psock; gfp_t alloc_save; trace_sk_data_ready(sk); alloc_save = sk->sk_allocation; sk->sk_allocation = GFP_ATOMIC; tls_strp_data_ready(&ctx->strp); sk->sk_allocation = alloc_save; psock = sk_psock_get(sk); if (psock) { if (!list_empty(&psock->ingress_msg)) ctx->saved_data_ready(sk); sk_psock_put(sk, psock); } } void tls_sw_cancel_work_tx(struct tls_context *tls_ctx) { struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); set_bit(BIT_TX_CLOSING, &ctx->tx_bitmask); set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask); cancel_delayed_work_sync(&ctx->tx_work.work); } void tls_sw_release_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec, *tmp; /* Wait for any pending async encryptions to complete */ tls_encrypt_async_wait(ctx); tls_tx_records(sk, -1); /* Free up un-sent records in tx_list. First, free * the partially sent record if any at head of tx_list. */ if (tls_ctx->partially_sent_record) { tls_free_partial_record(sk, tls_ctx); rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); list_del(&rec->list); sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { list_del(&rec->list); sk_msg_free(sk, &rec->msg_encrypted); sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } crypto_free_aead(ctx->aead_send); tls_free_open_rec(sk); } void tls_sw_free_ctx_tx(struct tls_context *tls_ctx) { struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); kfree(ctx); } void tls_sw_release_resources_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); if (ctx->aead_recv) { __skb_queue_purge(&ctx->rx_list); crypto_free_aead(ctx->aead_recv); tls_strp_stop(&ctx->strp); /* If tls_sw_strparser_arm() was not called (cleanup paths) * we still want to tls_strp_stop(), but sk->sk_data_ready was * never swapped. */ if (ctx->saved_data_ready) { write_lock_bh(&sk->sk_callback_lock); sk->sk_data_ready = ctx->saved_data_ready; write_unlock_bh(&sk->sk_callback_lock); } } } void tls_sw_strparser_done(struct tls_context *tls_ctx) { struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); tls_strp_done(&ctx->strp); } void tls_sw_free_ctx_rx(struct tls_context *tls_ctx) { struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); kfree(ctx); } void tls_sw_free_resources_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); tls_sw_release_resources_rx(sk); tls_sw_free_ctx_rx(tls_ctx); } /* The work handler to transmitt the encrypted records in tx_list */ static void tx_work_handler(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct tx_work *tx_work = container_of(delayed_work, struct tx_work, work); struct sock *sk = tx_work->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx; if (unlikely(!tls_ctx)) return; ctx = tls_sw_ctx_tx(tls_ctx); if (test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask)) return; if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) return; if (mutex_trylock(&tls_ctx->tx_lock)) { lock_sock(sk); tls_tx_records(sk, -1); release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); } else if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { /* Someone is holding the tx_lock, they will likely run Tx * and cancel the work on their way out of the lock section. * Schedule a long delay just in case. */ schedule_delayed_work(&ctx->tx_work.work, msecs_to_jiffies(10)); } } static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx) { struct tls_rec *rec; rec = list_first_entry_or_null(&ctx->tx_list, struct tls_rec, list); if (!rec) return false; return READ_ONCE(rec->tx_ready); } void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) { struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx); /* Schedule the transmission if tx list is ready */ if (tls_is_tx_ready(tx_ctx) && !test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask)) schedule_delayed_work(&tx_ctx->tx_work.work, 0); } void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx) { struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx); write_lock_bh(&sk->sk_callback_lock); rx_ctx->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); } void tls_update_rx_zc_capable(struct tls_context *tls_ctx) { struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx); rx_ctx->zc_capable = tls_ctx->rx_no_pad || tls_ctx->prot_info.version != TLS_1_3_VERSION; } static struct tls_sw_context_tx *init_ctx_tx(struct tls_context *ctx, struct sock *sk) { struct tls_sw_context_tx *sw_ctx_tx; if (!ctx->priv_ctx_tx) { sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); if (!sw_ctx_tx) return NULL; } else { sw_ctx_tx = ctx->priv_ctx_tx; } crypto_init_wait(&sw_ctx_tx->async_wait); atomic_set(&sw_ctx_tx->encrypt_pending, 1); INIT_LIST_HEAD(&sw_ctx_tx->tx_list); INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler); sw_ctx_tx->tx_work.sk = sk; return sw_ctx_tx; } static struct tls_sw_context_rx *init_ctx_rx(struct tls_context *ctx) { struct tls_sw_context_rx *sw_ctx_rx; if (!ctx->priv_ctx_rx) { sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); if (!sw_ctx_rx) return NULL; } else { sw_ctx_rx = ctx->priv_ctx_rx; } crypto_init_wait(&sw_ctx_rx->async_wait); atomic_set(&sw_ctx_rx->decrypt_pending, 1); init_waitqueue_head(&sw_ctx_rx->wq); skb_queue_head_init(&sw_ctx_rx->rx_list); skb_queue_head_init(&sw_ctx_rx->async_hold); return sw_ctx_rx; } int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { u16 nonce_size = cipher_desc->nonce; if (crypto_info->version == TLS_1_3_VERSION) { nonce_size = 0; prot->aad_size = TLS_HEADER_SIZE; prot->tail_size = 1; } else { prot->aad_size = TLS_AAD_SPACE_SIZE; prot->tail_size = 0; } /* Sanity-check the sizes for stack allocations. */ if (nonce_size > TLS_MAX_IV_SIZE || prot->aad_size > TLS_MAX_AAD_SIZE) return -EINVAL; prot->version = crypto_info->version; prot->cipher_type = crypto_info->cipher_type; prot->prepend_size = TLS_HEADER_SIZE + nonce_size; prot->tag_size = cipher_desc->tag; prot->overhead_size = prot->prepend_size + prot->tag_size + prot->tail_size; prot->iv_size = cipher_desc->iv; prot->salt_size = cipher_desc->salt; prot->rec_seq_size = cipher_desc->rec_seq; return 0; } static void tls_finish_key_update(struct sock *sk, struct tls_context *tls_ctx) { struct tls_sw_context_rx *ctx = tls_ctx->priv_ctx_rx; WRITE_ONCE(ctx->key_update_pending, false); /* wake-up pre-existing poll() */ ctx->saved_data_ready(sk); } int tls_set_sw_offload(struct sock *sk, int tx, struct tls_crypto_info *new_crypto_info) { struct tls_crypto_info *crypto_info, *src_crypto_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; const struct tls_cipher_desc *cipher_desc; char *iv, *rec_seq, *key, *salt; struct cipher_context *cctx; struct tls_prot_info *prot; struct crypto_aead **aead; struct tls_context *ctx; struct crypto_tfm *tfm; int rc = 0; ctx = tls_get_ctx(sk); prot = &ctx->prot_info; /* new_crypto_info != NULL means rekey */ if (!new_crypto_info) { if (tx) { ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); if (!ctx->priv_ctx_tx) return -ENOMEM; } else { ctx->priv_ctx_rx = init_ctx_rx(ctx); if (!ctx->priv_ctx_rx) return -ENOMEM; } } if (tx) { sw_ctx_tx = ctx->priv_ctx_tx; crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { sw_ctx_rx = ctx->priv_ctx_rx; crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; } src_crypto_info = new_crypto_info ?: crypto_info; cipher_desc = get_cipher_desc(src_crypto_info->cipher_type); if (!cipher_desc) { rc = -EINVAL; goto free_priv; } rc = init_prot_info(prot, src_crypto_info, cipher_desc); if (rc) goto free_priv; iv = crypto_info_iv(src_crypto_info, cipher_desc); key = crypto_info_key(src_crypto_info, cipher_desc); salt = crypto_info_salt(src_crypto_info, cipher_desc); rec_seq = crypto_info_rec_seq(src_crypto_info, cipher_desc); if (!*aead) { *aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0); if (IS_ERR(*aead)) { rc = PTR_ERR(*aead); *aead = NULL; goto free_priv; } } ctx->push_pending_record = tls_sw_push_pending_record; /* setkey is the last operation that could fail during a * rekey. if it succeeds, we can start modifying the * context. */ rc = crypto_aead_setkey(*aead, key, cipher_desc->key); if (rc) { if (new_crypto_info) goto out; else goto free_aead; } if (!new_crypto_info) { rc = crypto_aead_setauthsize(*aead, prot->tag_size); if (rc) goto free_aead; } if (!tx && !new_crypto_info) { tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); tls_update_rx_zc_capable(ctx); sw_ctx_rx->async_capable = src_crypto_info->version != TLS_1_3_VERSION && !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC); rc = tls_strp_init(&sw_ctx_rx->strp, sk); if (rc) goto free_aead; } memcpy(cctx->iv, salt, cipher_desc->salt); memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv); memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq); if (new_crypto_info) { unsafe_memcpy(crypto_info, new_crypto_info, cipher_desc->crypto_info, /* size was checked in do_tls_setsockopt_conf */); memzero_explicit(new_crypto_info, cipher_desc->crypto_info); if (!tx) tls_finish_key_update(sk, ctx); } goto out; free_aead: crypto_free_aead(*aead); *aead = NULL; free_priv: if (!new_crypto_info) { if (tx) { kfree(ctx->priv_ctx_tx); ctx->priv_ctx_tx = NULL; } else { kfree(ctx->priv_ctx_rx); ctx->priv_ctx_rx = NULL; } } out: return rc; }
29 26 3 19 19 13 13 5 19 19 19 7 11 12 12 8 4 12 7 6 1 4 7 1 1 3 2 1 1 1 1 3 3 1 3 1 18 1 1 1 39 1 3 35 32 2 17 2 1 6 9 14 13 1 4 1 7 12 19 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 /* net/tipc/udp_media.c: IP bearer support for TIPC * * Copyright (c) 2015, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/list.h> #include <net/sock.h> #include <net/ip.h> #include <net/udp_tunnel.h> #include <net/ipv6_stubs.h> #include <linux/tipc_netlink.h> #include "core.h" #include "addr.h" #include "net.h" #include "bearer.h" #include "netlink.h" #include "msg.h" #include "udp_media.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 #define UDP_MIN_HEADROOM 48 /** * struct udp_media_addr - IP/UDP addressing information * * This is the bearer level originating address used in neighbor discovery * messages, and all fields should be in network byte order * * @proto: Ethernet protocol in use * @port: port being used * @ipv4: IPv4 address of neighbor * @ipv6: IPv6 address of neighbor */ struct udp_media_addr { __be16 proto; __be16 port; union { struct in_addr ipv4; struct in6_addr ipv6; }; }; /* struct udp_replicast - container for UDP remote addresses */ struct udp_replicast { struct udp_media_addr addr; struct dst_cache dst_cache; struct rcu_head rcu; struct list_head list; }; /** * struct udp_bearer - ip/udp bearer data structure * @bearer: associated generic tipc bearer * @ubsock: bearer associated socket * @ifindex: local address scope * @work: used to schedule deferred work on a bearer * @rcast: associated udp_replicast container */ struct udp_bearer { struct tipc_bearer __rcu *bearer; struct socket *ubsock; u32 ifindex; struct work_struct work; struct udp_replicast rcast; }; static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr) { if (ntohs(addr->proto) == ETH_P_IP) return ipv4_is_multicast(addr->ipv4.s_addr); #if IS_ENABLED(CONFIG_IPV6) else return ipv6_addr_is_multicast(&addr->ipv6); #endif return 0; } /* udp_media_addr_set - convert a ip/udp address to a TIPC media address */ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, struct udp_media_addr *ua) { memset(addr, 0, sizeof(struct tipc_media_addr)); addr->media_id = TIPC_MEDIA_TYPE_UDP; memcpy(addr->value, ua, sizeof(struct udp_media_addr)); if (tipc_udp_is_mcast_addr(ua)) addr->broadcast = TIPC_BROADCAST_SUPPORT; } /* tipc_udp_addr2str - convert ip/udp address to string */ static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size) { struct udp_media_addr *ua = (struct udp_media_addr *)&a->value; if (ntohs(ua->proto) == ETH_P_IP) snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port)); else if (ntohs(ua->proto) == ETH_P_IPV6) snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port)); else { pr_err("Invalid UDP media address\n"); return 1; } return 0; } /* tipc_udp_msg2addr - extract an ip/udp address from a TIPC ndisc message */ static int tipc_udp_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *a, char *msg) { struct udp_media_addr *ua; ua = (struct udp_media_addr *) (msg + TIPC_MEDIA_ADDR_OFFSET); if (msg[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_UDP) return -EINVAL; tipc_udp_media_addr_set(a, ua); return 0; } /* tipc_udp_addr2msg - write an ip/udp address to a TIPC ndisc message */ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_UDP; memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, a->value, sizeof(struct udp_media_addr)); return 0; } /* tipc_send_msg - enqueue a send request */ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, struct udp_bearer *ub, struct udp_media_addr *src, struct udp_media_addr *dst, struct dst_cache *cache) { struct dst_entry *ndst; int ttl, err = 0; local_bh_disable(); ndst = dst_cache_get(cache); if (dst->proto == htons(ETH_P_IP)) { struct rtable *rt = dst_rtable(ndst); if (!rt) { struct flowi4 fl = { .daddr = dst->ipv4.s_addr, .saddr = src->ipv4.s_addr, .flowi4_mark = skb->mark, .flowi4_proto = IPPROTO_UDP }; rt = ip_route_output_key(net, &fl); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto tx_error; } dst_cache_set_ip4(cache, &rt->dst, fl.saddr); } ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->port, dst->port, false, true); #if IS_ENABLED(CONFIG_IPV6) } else { if (!ndst) { struct flowi6 fl6 = { .flowi6_oif = ub->ifindex, .daddr = dst->ipv6, .saddr = src->ipv6, .flowi6_proto = IPPROTO_UDP }; ndst = ipv6_stub->ipv6_dst_lookup_flow(net, ub->ubsock->sk, &fl6, NULL); if (IS_ERR(ndst)) { err = PTR_ERR(ndst); goto tx_error; } dst_cache_set_ip6(cache, ndst, &fl6.saddr); } ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, &src->ipv6, &dst->ipv6, 0, ttl, 0, src->port, dst->port, false); #endif } local_bh_enable(); return err; tx_error: local_bh_enable(); kfree_skb(skb); return err; } static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *addr) { struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value; struct udp_replicast *rcast; struct udp_bearer *ub; int err = 0; if (skb_headroom(skb) < UDP_MIN_HEADROOM) { err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); if (err) goto out; } skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); ub = rcu_dereference(b->media_ptr); if (!ub) { err = -ENODEV; goto out; } if (addr->broadcast != TIPC_REPLICAST_SUPPORT) return tipc_udp_xmit(net, skb, ub, src, dst, &ub->rcast.dst_cache); /* Replicast, send an skb to each configured IP address */ list_for_each_entry_rcu(rcast, &ub->rcast.list, list) { struct sk_buff *_skb; _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) { err = -ENOMEM; goto out; } err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr, &rcast->dst_cache); if (err) goto out; } err = 0; out: kfree_skb(skb); return err; } static bool tipc_udp_is_known_peer(struct tipc_bearer *b, struct udp_media_addr *addr) { struct udp_replicast *rcast, *tmp; struct udp_bearer *ub; ub = rcu_dereference_rtnl(b->media_ptr); if (!ub) { pr_err_ratelimited("UDP bearer instance not found\n"); return false; } list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { if (!memcmp(&rcast->addr, addr, sizeof(struct udp_media_addr))) return true; } return false; } static int tipc_udp_rcast_add(struct tipc_bearer *b, struct udp_media_addr *addr) { struct udp_replicast *rcast; struct udp_bearer *ub; ub = rcu_dereference_rtnl(b->media_ptr); if (!ub) return -ENODEV; rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC); if (!rcast) return -ENOMEM; if (dst_cache_init(&rcast->dst_cache, GFP_ATOMIC)) { kfree(rcast); return -ENOMEM; } memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr)); if (ntohs(addr->proto) == ETH_P_IP) pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4); #if IS_ENABLED(CONFIG_IPV6) else if (ntohs(addr->proto) == ETH_P_IPV6) pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6); #endif b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT; list_add_rcu(&rcast->list, &ub->rcast.list); return 0; } static int tipc_udp_rcast_disc(struct tipc_bearer *b, struct sk_buff *skb) { struct udp_media_addr src = {0}; struct udp_media_addr *dst; dst = (struct udp_media_addr *)&b->bcast_addr.value; if (tipc_udp_is_mcast_addr(dst)) return 0; src.port = udp_hdr(skb)->source; if (ip_hdr(skb)->version == 4) { struct iphdr *iphdr = ip_hdr(skb); src.proto = htons(ETH_P_IP); src.ipv4.s_addr = iphdr->saddr; if (ipv4_is_multicast(iphdr->daddr)) return 0; #if IS_ENABLED(CONFIG_IPV6) } else if (ip_hdr(skb)->version == 6) { struct ipv6hdr *iphdr = ipv6_hdr(skb); src.proto = htons(ETH_P_IPV6); src.ipv6 = iphdr->saddr; if (ipv6_addr_is_multicast(&iphdr->daddr)) return 0; #endif } else { return 0; } if (likely(tipc_udp_is_known_peer(b, &src))) return 0; return tipc_udp_rcast_add(b, &src); } /* tipc_udp_recv - read data from bearer socket */ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) { struct udp_bearer *ub; struct tipc_bearer *b; struct tipc_msg *hdr; int err; ub = rcu_dereference_sk_user_data(sk); if (!ub) { pr_err_ratelimited("Failed to get UDP bearer reference"); goto out; } skb_pull(skb, sizeof(struct udphdr)); hdr = buf_msg(skb); b = rcu_dereference(ub->bearer); if (!b) goto out; if (b && test_bit(0, &b->up)) { TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(sock_net(sk), skb, b); return 0; } if (unlikely(msg_user(hdr) == LINK_CONFIG)) { err = tipc_udp_rcast_disc(b, skb); if (err) goto out; } out: kfree_skb(skb); return 0; } static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) { int err = 0; struct ip_mreqn mreqn; struct sock *sk = ub->ubsock->sk; if (ntohs(remote->proto) == ETH_P_IP) { mreqn.imr_multiaddr = remote->ipv4; mreqn.imr_ifindex = ub->ifindex; err = ip_mc_join_group(sk, &mreqn); #if IS_ENABLED(CONFIG_IPV6) } else { lock_sock(sk); err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); release_sock(sk); #endif } return err; } static int __tipc_nl_add_udp_addr(struct sk_buff *skb, struct udp_media_addr *addr, int nla_t) { if (ntohs(addr->proto) == ETH_P_IP) { struct sockaddr_in ip4; memset(&ip4, 0, sizeof(ip4)); ip4.sin_family = AF_INET; ip4.sin_port = addr->port; ip4.sin_addr.s_addr = addr->ipv4.s_addr; if (nla_put(skb, nla_t, sizeof(ip4), &ip4)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) } else if (ntohs(addr->proto) == ETH_P_IPV6) { struct sockaddr_in6 ip6; memset(&ip6, 0, sizeof(ip6)); ip6.sin6_family = AF_INET6; ip6.sin6_port = addr->port; memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr)); if (nla_put(skb, nla_t, sizeof(ip6), &ip6)) return -EMSGSIZE; #endif } return 0; } int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) { u32 bid = cb->args[0]; u32 skip_cnt = cb->args[1]; u32 portid = NETLINK_CB(cb->skb).portid; struct udp_replicast *rcast, *tmp; struct tipc_bearer *b; struct udp_bearer *ub; void *hdr; int err; int i; if (!bid && !skip_cnt) { struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct net *net = sock_net(skb->sk); struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; char *bname; if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; err = nla_parse_nested_deprecated(battrs, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER], tipc_nl_bearer_policy, NULL); if (err) return err; if (!battrs[TIPC_NLA_BEARER_NAME]) return -EINVAL; bname = nla_data(battrs[TIPC_NLA_BEARER_NAME]); rtnl_lock(); b = tipc_bearer_find(net, bname); if (!b) { rtnl_unlock(); return -EINVAL; } bid = b->identity; } else { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); rtnl_lock(); b = rtnl_dereference(tn->bearer_list[bid]); if (!b) { rtnl_unlock(); return -EINVAL; } } ub = rtnl_dereference(b->media_ptr); if (!ub) { rtnl_unlock(); return -EINVAL; } i = 0; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { if (i < skip_cnt) goto count; hdr = genlmsg_put(skb, portid, cb->nlh->nlmsg_seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_BEARER_GET); if (!hdr) goto done; err = __tipc_nl_add_udp_addr(skb, &rcast->addr, TIPC_NLA_UDP_REMOTE); if (err) { genlmsg_cancel(skb, hdr); goto done; } genlmsg_end(skb, hdr); count: i++; } done: rtnl_unlock(); cb->args[0] = bid; cb->args[1] = i; return skb->len; } int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) { struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct udp_media_addr *dst; struct udp_bearer *ub; struct nlattr *nest; ub = rtnl_dereference(b->media_ptr); if (!ub) return -ENODEV; nest = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER_UDP_OPTS); if (!nest) goto msg_full; if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL)) goto msg_full; dst = (struct udp_media_addr *)&b->bcast_addr.value; if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE)) goto msg_full; if (!list_empty(&ub->rcast.list)) { if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP)) goto msg_full; } nla_nest_end(msg->skb, nest); return 0; msg_full: nla_nest_cancel(msg->skb, nest); return -EMSGSIZE; } /** * tipc_parse_udp_addr - build udp media address from netlink data * @nla: netlink attribute containing sockaddr storage aligned address * @addr: tipc media address to fill with address, port and protocol type * @scope_id: IPv6 scope id pointer, not NULL indicates it's required */ static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr, u32 *scope_id) { struct sockaddr_storage sa; nla_memcpy(&sa, nla, sizeof(sa)); if (sa.ss_family == AF_INET) { struct sockaddr_in *ip4 = (struct sockaddr_in *)&sa; addr->proto = htons(ETH_P_IP); addr->port = ip4->sin_port; addr->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; #if IS_ENABLED(CONFIG_IPV6) } else if (sa.ss_family == AF_INET6) { struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)&sa; addr->proto = htons(ETH_P_IPV6); addr->port = ip6->sin6_port; memcpy(&addr->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); /* Scope ID is only interesting for local addresses */ if (scope_id) { int atype; atype = ipv6_addr_type(&ip6->sin6_addr); if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id) { return -EINVAL; } *scope_id = ip6->sin6_scope_id ? : 0; } return 0; #endif } return -EADDRNOTAVAIL; } int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr) { int err; struct udp_media_addr addr = {0}; struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; struct udp_media_addr *dst; if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy, NULL)) return -EINVAL; if (!opts[TIPC_NLA_UDP_REMOTE]) return -EINVAL; err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL); if (err) return err; dst = (struct udp_media_addr *)&b->bcast_addr.value; if (tipc_udp_is_mcast_addr(dst)) { pr_err("Can't add remote ip to TIPC UDP multicast bearer\n"); return -EINVAL; } if (tipc_udp_is_known_peer(b, &addr)) return 0; return tipc_udp_rcast_add(b, &addr); } /** * tipc_udp_enable - callback to create a new udp bearer instance * @net: network namespace * @b: pointer to generic tipc_bearer * @attrs: netlink bearer configuration * * validate the bearer parameters and initialize the udp bearer * rtnl_lock should be held */ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]) { int err = -EINVAL; struct udp_bearer *ub; struct udp_media_addr remote = {0}; struct udp_media_addr local = {0}; struct udp_port_cfg udp_conf = {0}; struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; u8 node_id[NODE_ID_LEN] = {0,}; struct net_device *dev; int rmcast = 0; ub = kzalloc(sizeof(*ub), GFP_ATOMIC); if (!ub) return -ENOMEM; INIT_LIST_HEAD(&ub->rcast.list); if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) goto err; if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attrs[TIPC_NLA_BEARER_UDP_OPTS], tipc_nl_udp_policy, NULL)) goto err; if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) { pr_err("Invalid UDP bearer configuration"); err = -EINVAL; goto err; } err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local, &ub->ifindex); if (err) goto err; err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL); if (err) goto err; if (remote.proto != local.proto) { err = -EINVAL; goto err; } /* Checking remote ip address */ rmcast = tipc_udp_is_mcast_addr(&remote); /* Autoconfigure own node identity if needed */ if (!tipc_own_id(net)) { memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16); tipc_net_init(net, node_id, 0); } if (!tipc_own_id(net)) { pr_warn("Failed to set node id, please configure manually\n"); err = -EINVAL; goto err; } b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; rcu_assign_pointer(b->media_ptr, ub); rcu_assign_pointer(ub->bearer, b); tipc_udp_media_addr_set(&b->addr, &local); if (local.proto == htons(ETH_P_IP)) { dev = __ip_dev_find(net, local.ipv4.s_addr, false); if (!dev) { err = -ENODEV; goto err; } udp_conf.family = AF_INET; /* Switch to use ANY to receive packets from group */ if (rmcast) udp_conf.local_ip.s_addr = htonl(INADDR_ANY); else udp_conf.local_ip.s_addr = local.ipv4.s_addr; udp_conf.use_udp_checksums = false; ub->ifindex = dev->ifindex; b->encap_hlen = sizeof(struct iphdr) + sizeof(struct udphdr); b->mtu = b->media->mtu; #if IS_ENABLED(CONFIG_IPV6) } else if (local.proto == htons(ETH_P_IPV6)) { dev = ub->ifindex ? __dev_get_by_index(net, ub->ifindex) : NULL; dev = ipv6_dev_find(net, &local.ipv6, dev); if (!dev) { err = -ENODEV; goto err; } udp_conf.family = AF_INET6; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; if (rmcast) udp_conf.local_ip6 = in6addr_any; else udp_conf.local_ip6 = local.ipv6; ub->ifindex = dev->ifindex; b->encap_hlen = sizeof(struct ipv6hdr) + sizeof(struct udphdr); b->mtu = 1280; #endif } else { err = -EAFNOSUPPORT; goto err; } udp_conf.local_udp_port = local.port; err = udp_sock_create(net, &udp_conf, &ub->ubsock); if (err) goto err; tuncfg.sk_user_data = ub; tuncfg.encap_type = 1; tuncfg.encap_rcv = tipc_udp_recv; tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); err = dst_cache_init(&ub->rcast.dst_cache, GFP_ATOMIC); if (err) goto free; /* * The bcast media address port is used for all peers and the ip * is used if it's a multicast address. */ memcpy(&b->bcast_addr.value, &remote, sizeof(remote)); if (rmcast) err = enable_mcast(ub, &remote); else err = tipc_udp_rcast_add(b, &remote); if (err) goto free; return 0; free: dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); err: kfree(ub); return err; } /* cleanup_bearer - break the socket/bearer association */ static void cleanup_bearer(struct work_struct *work) { struct udp_bearer *ub = container_of(work, struct udp_bearer, work); struct udp_replicast *rcast, *tmp; struct tipc_net *tn; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { dst_cache_destroy(&rcast->dst_cache); list_del_rcu(&rcast->list); kfree_rcu(rcast, rcu); } tn = tipc_net(sock_net(ub->ubsock->sk)); dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); /* Note: could use a call_rcu() to avoid another synchronize_net() */ synchronize_net(); atomic_dec(&tn->wq_count); kfree(ub); } /* tipc_udp_disable - detach bearer from socket */ static void tipc_udp_disable(struct tipc_bearer *b) { struct udp_bearer *ub; ub = rtnl_dereference(b->media_ptr); if (!ub) { pr_err("UDP bearer instance not found\n"); return; } sock_set_flag(ub->ubsock->sk, SOCK_DEAD); RCU_INIT_POINTER(ub->bearer, NULL); /* sock_release need to be done outside of rtnl lock */ atomic_inc(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); INIT_WORK(&ub->work, cleanup_bearer); schedule_work(&ub->work); } struct tipc_media udp_media_info = { .send_msg = tipc_udp_send_msg, .enable_media = tipc_udp_enable, .disable_media = tipc_udp_disable, .addr2str = tipc_udp_addr2str, .addr2msg = tipc_udp_addr2msg, .msg2addr = tipc_udp_msg2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_DEF_LINK_WIN, .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, .name = "udp" };
13689 157 14149 41 13570 13553 131 6 3 2 1 1 2 5 842 6 13 862 123 137 106 31 761 173 174 47 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NET_SCM_H #define __LINUX_NET_SCM_H #include <linux/limits.h> #include <linux/net.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/sched/signal.h> #include <net/compat.h> /* Well, we should have at least one descriptor open * to accept passed FDs 8) */ #define SCM_MAX_FD 253 struct scm_creds { u32 pid; kuid_t uid; kgid_t gid; }; #ifdef CONFIG_UNIX struct unix_edge; #endif struct scm_fp_list { short count; short count_unix; short max; #ifdef CONFIG_UNIX bool inflight; bool dead; struct list_head vertices; struct unix_edge *edges; #endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; struct scm_cookie { struct pid *pid; /* Skb credentials */ struct scm_fp_list *fp; /* Passed files */ struct scm_creds creds; /* Skb credentials */ #ifdef CONFIG_SECURITY_NETWORK u32 secid; /* Passed security ID */ #endif }; void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm); void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm); void __scm_destroy(struct scm_cookie *scm); struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl); #ifdef CONFIG_SECURITY_NETWORK static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) { security_socket_getpeersec_dgram(sock, NULL, &scm->secid); } #else static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) { } #endif /* CONFIG_SECURITY_NETWORK */ static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; } static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) { scm_destroy_cred(scm); if (scm->fp) __scm_destroy(scm); } static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, bool forcecreds) { memset(scm, 0, sizeof(*scm)); scm->creds.uid = INVALID_UID; scm->creds.gid = INVALID_GID; if (forcecreds) scm_set_cred(scm, task_tgid(current), current_uid(), current_gid()); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; return __scm_send(sock, msg, scm); } #ifdef CONFIG_SECURITY_NETWORK static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { struct lsm_context ctx; int err; if (test_bit(SOCK_PASSSEC, &sock->flags)) { err = security_secid_to_secctx(scm->secid, &ctx); if (err >= 0) { put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, ctx.context); security_release_secctx(&ctx); } } } static inline bool scm_has_secdata(struct socket *sock) { return test_bit(SOCK_PASSSEC, &sock->flags); } #else static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { } static inline bool scm_has_secdata(struct socket *sock) { return false; } #endif /* CONFIG_SECURITY_NETWORK */ static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) { struct file *pidfd_file = NULL; int len, pidfd; /* put_cmsg() doesn't return an error if CMSG is truncated, * that's why we need to opencode these checks here. */ if (msg->msg_flags & MSG_CMSG_COMPAT) len = sizeof(struct compat_cmsghdr) + sizeof(int); else len = sizeof(struct cmsghdr) + sizeof(int); if (msg->msg_controllen < len) { msg->msg_flags |= MSG_CTRUNC; return; } if (!scm->pid) return; pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { if (pidfd_file) { put_unused_fd(pidfd); fput(pidfd_file); } return; } if (pidfd_file) fd_install(pidfd, pidfd_file); } static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!msg->msg_control) { if (test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags) || scm->fp || scm_has_secdata(sock)) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); return false; } if (test_bit(SOCK_PASSCRED, &sock->flags)) { struct user_namespace *current_ns = current_user_ns(); struct ucred ucreds = { .pid = scm->creds.pid, .uid = from_kuid_munged(current_ns, scm->creds.uid), .gid = from_kgid_munged(current_ns, scm->creds.gid), }; put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } scm_passec(sock, msg, scm); if (scm->fp) scm_detach_fds(msg, scm); return true; } static inline void scm_recv(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!__scm_recv_common(sock, msg, scm, flags)) return; scm_destroy_cred(scm); } static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!__scm_recv_common(sock, msg, scm, flags)) return; if (test_bit(SOCK_PASSPIDFD, &sock->flags)) scm_pidfd_recv(msg, scm); scm_destroy_cred(scm); } static inline int scm_recv_one_fd(struct file *f, int __user *ufd, unsigned int flags) { if (!ufd) return -EFAULT; return receive_fd(f, ufd, flags); } #endif /* __LINUX_NET_SCM_H */
1694 1694 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 // SPDX-License-Identifier: GPL-2.0-only /* * lib/hexdump.c */ #include <linux/types.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/minmax.h> #include <linux/export.h> #include <linux/unaligned.h> const char hex_asc[] = "0123456789abcdef"; EXPORT_SYMBOL(hex_asc); const char hex_asc_upper[] = "0123456789ABCDEF"; EXPORT_SYMBOL(hex_asc_upper); /** * hex_to_bin - convert a hex digit to its real value * @ch: ascii character represents hex digit * * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad * input. * * This function is used to load cryptographic keys, so it is coded in such a * way that there are no conditions or memory accesses that depend on data. * * Explanation of the logic: * (ch - '9' - 1) is negative if ch <= '9' * ('0' - 1 - ch) is negative if ch >= '0' * we "and" these two values, so the result is negative if ch is in the range * '0' ... '9' * we are only interested in the sign, so we do a shift ">> 8"; note that right * shift of a negative value is implementation-defined, so we cast the * value to (unsigned) before the shift --- we have 0xffffff if ch is in * the range '0' ... '9', 0 otherwise * we "and" this value with (ch - '0' + 1) --- we have a value 1 ... 10 if ch is * in the range '0' ... '9', 0 otherwise * we add this value to -1 --- we have a value 0 ... 9 if ch is in the range '0' * ... '9', -1 otherwise * the next line is similar to the previous one, but we need to decode both * uppercase and lowercase letters, so we use (ch & 0xdf), which converts * lowercase to uppercase */ int hex_to_bin(unsigned char ch) { unsigned char cu = ch & 0xdf; return -1 + ((ch - '0' + 1) & (unsigned)((ch - '9' - 1) & ('0' - 1 - ch)) >> 8) + ((cu - 'A' + 11) & (unsigned)((cu - 'F' - 1) & ('A' - 1 - cu)) >> 8); } EXPORT_SYMBOL(hex_to_bin); /** * hex2bin - convert an ascii hexadecimal string to its binary representation * @dst: binary result * @src: ascii hexadecimal string * @count: result length * * Return 0 on success, -EINVAL in case of bad input. */ int hex2bin(u8 *dst, const char *src, size_t count) { while (count--) { int hi, lo; hi = hex_to_bin(*src++); if (unlikely(hi < 0)) return -EINVAL; lo = hex_to_bin(*src++); if (unlikely(lo < 0)) return -EINVAL; *dst++ = (hi << 4) | lo; } return 0; } EXPORT_SYMBOL(hex2bin); /** * bin2hex - convert binary data to an ascii hexadecimal string * @dst: ascii hexadecimal result * @src: binary data * @count: binary data length */ char *bin2hex(char *dst, const void *src, size_t count) { const unsigned char *_src = src; while (count--) dst = hex_byte_pack(dst, *_src++); return dst; } EXPORT_SYMBOL(bin2hex); /** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump * @len: number of bytes in the @buf * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @linebuf: where to put the converted data * @linebuflen: total size of @linebuf, including space for terminating NUL * @ascii: include ASCII after the hex output * * hex_dump_to_buffer() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data * to a hex + ASCII dump at the supplied memory location. * The converted output is always NUL-terminated. * * E.g.: * hex_dump_to_buffer(frame->data, frame->len, 16, 1, * linebuf, sizeof(linebuf), true); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * * Return: * The amount of bytes placed in the buffer without terminating NUL. If the * output was truncated, then the return value is the number of bytes * (excluding the terminating NUL) which would have been written to the final * string if enough space had been available. */ int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, char *linebuf, size_t linebuflen, bool ascii) { const u8 *ptr = buf; int ngroups; u8 ch; int j, lx = 0; int ascii_column; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; if (len > rowsize) /* limit to one line at a time */ len = rowsize; if (!is_power_of_2(groupsize) || groupsize > 8) groupsize = 1; if ((len % groupsize) != 0) /* no mixed size output */ groupsize = 1; ngroups = len / groupsize; ascii_column = rowsize * 2 + rowsize / groupsize + 1; if (!linebuflen) goto overflow1; if (!len) goto nil; if (groupsize == 8) { const u64 *ptr8 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%16.16llx", j ? " " : "", get_unaligned(ptr8 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else if (groupsize == 4) { const u32 *ptr4 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%8.8x", j ? " " : "", get_unaligned(ptr4 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else if (groupsize == 2) { const u16 *ptr2 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%4.4x", j ? " " : "", get_unaligned(ptr2 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else { for (j = 0; j < len; j++) { if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = hex_asc_lo(ch); if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = ' '; } if (j) lx--; } if (!ascii) goto nil; while (lx < ascii_column) { if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = ' '; } for (j = 0; j < len; j++) { if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; } nil: linebuf[lx] = '\0'; return lx; overflow2: linebuf[lx++] = '\0'; overflow1: return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; } EXPORT_SYMBOL(hex_dump_to_buffer); #ifdef CONFIG_PRINTK /** * print_hex_dump - print a text hex dump to syslog for a binary blob of data * @level: kernel log level (e.g. KERN_DEBUG) * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @buf: data blob to dump * @len: number of bytes in the @buf * @ascii: include ASCII after the hex output * * Given a buffer of u8 data, print_hex_dump() prints a hex + ASCII dump * to the kernel log at the specified kernel log level, with an optional * leading prefix. * * print_hex_dump() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * print_hex_dump() iterates over the entire input @buf, breaking it into * "line size" chunks to format and print. * * E.g.: * print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS, * 16, 1, frame->data, frame->len, true); * * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode: * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * Example output using %DUMP_PREFIX_ADDRESS and 4-byte mode: * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c pqrstuvwxyz{|}~. */ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, linebuf, sizeof(linebuf), ascii); switch (prefix_type) { case DUMP_PREFIX_ADDRESS: printk("%s%s%p: %s\n", level, prefix_str, ptr + i, linebuf); break; case DUMP_PREFIX_OFFSET: printk("%s%s%.8x: %s\n", level, prefix_str, i, linebuf); break; default: printk("%s%s%s\n", level, prefix_str, linebuf); break; } } } EXPORT_SYMBOL(print_hex_dump); #endif /* defined(CONFIG_PRINTK) */
14 2 3 1 5 28 20 24 20 12 15 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 /* SPDX-License-Identifier: GPL-2.0-only */ /* * An interface between IEEE802.15.4 device and rest of the kernel. * * Copyright (C) 2007-2012 Siemens AG * * Written by: * Pavel Smolenskiy <pavel.smolenskiy@gmail.com> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> * Maxim Osipov <maxim.osipov@siemens.com> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #ifndef IEEE802154_NETDEVICE_H #define IEEE802154_NETDEVICE_H #define IEEE802154_REQUIRED_SIZE(struct_type, member) \ (offsetof(typeof(struct_type), member) + \ sizeof(((typeof(struct_type) *)(NULL))->member)) #define IEEE802154_ADDR_OFFSET \ offsetof(typeof(struct sockaddr_ieee802154), addr) #define IEEE802154_MIN_NAMELEN (IEEE802154_ADDR_OFFSET + \ IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, addr_type)) #define IEEE802154_NAMELEN_SHORT (IEEE802154_ADDR_OFFSET + \ IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, short_addr)) #define IEEE802154_NAMELEN_LONG (IEEE802154_ADDR_OFFSET + \ IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, hwaddr)) #include <net/af_ieee802154.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/ieee802154.h> #include <net/cfg802154.h> struct ieee802154_beacon_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) u16 beacon_order:4, superframe_order:4, final_cap_slot:4, battery_life_ext:1, reserved0:1, pan_coordinator:1, assoc_permit:1; u8 gts_count:3, gts_reserved:4, gts_permit:1; u8 pend_short_addr_count:3, reserved1:1, pend_ext_addr_count:3, reserved2:1; #elif defined(__BIG_ENDIAN_BITFIELD) u16 assoc_permit:1, pan_coordinator:1, reserved0:1, battery_life_ext:1, final_cap_slot:4, superframe_order:4, beacon_order:4; u8 gts_permit:1, gts_reserved:4, gts_count:3; u8 reserved2:1, pend_ext_addr_count:3, reserved1:1, pend_short_addr_count:3; #else #error "Please fix <asm/byteorder.h>" #endif } __packed; struct ieee802154_mac_cmd_pl { u8 cmd_id; } __packed; struct ieee802154_sechdr { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 level:3, key_id_mode:2, reserved:3; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved:3, key_id_mode:2, level:3; #else #error "Please fix <asm/byteorder.h>" #endif u8 key_id; __le32 frame_counter; union { __le32 short_src; __le64 extended_src; }; }; struct ieee802154_hdr_fc { #if defined(__LITTLE_ENDIAN_BITFIELD) u16 type:3, security_enabled:1, frame_pending:1, ack_request:1, intra_pan:1, reserved:3, dest_addr_mode:2, version:2, source_addr_mode:2; #elif defined(__BIG_ENDIAN_BITFIELD) u16 reserved:1, intra_pan:1, ack_request:1, frame_pending:1, security_enabled:1, type:3, source_addr_mode:2, version:2, dest_addr_mode:2, reserved2:2; #else #error "Please fix <asm/byteorder.h>" #endif }; struct ieee802154_assoc_req_pl { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved1:1, device_type:1, power_source:1, rx_on_when_idle:1, assoc_type:1, reserved2:1, security_cap:1, alloc_addr:1; #elif defined(__BIG_ENDIAN_BITFIELD) u8 alloc_addr:1, security_cap:1, reserved2:1, assoc_type:1, rx_on_when_idle:1, power_source:1, device_type:1, reserved1:1; #else #error "Please fix <asm/byteorder.h>" #endif } __packed; struct ieee802154_assoc_resp_pl { __le16 short_addr; u8 status; } __packed; enum ieee802154_frame_version { IEEE802154_2003_STD, IEEE802154_2006_STD, IEEE802154_STD, IEEE802154_RESERVED_STD, IEEE802154_MULTIPURPOSE_STD = IEEE802154_2003_STD, }; enum ieee802154_addressing_mode { IEEE802154_NO_ADDRESSING, IEEE802154_RESERVED, IEEE802154_SHORT_ADDRESSING, IEEE802154_EXTENDED_ADDRESSING, }; enum ieee802154_association_status { IEEE802154_ASSOCIATION_SUCCESSFUL = 0x00, IEEE802154_PAN_AT_CAPACITY = 0x01, IEEE802154_PAN_ACCESS_DENIED = 0x02, IEEE802154_HOPPING_SEQUENCE_OFFSET_DUP = 0x03, IEEE802154_FAST_ASSOCIATION_SUCCESSFUL = 0x80, }; enum ieee802154_disassociation_reason { IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE = 0x1, IEEE802154_DEVICE_WISHES_TO_LEAVE = 0x2, }; struct ieee802154_hdr { struct ieee802154_hdr_fc fc; u8 seq; struct ieee802154_addr source; struct ieee802154_addr dest; struct ieee802154_sechdr sec; }; struct ieee802154_beacon_frame { struct ieee802154_hdr mhr; struct ieee802154_beacon_hdr mac_pl; }; struct ieee802154_mac_cmd_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; }; struct ieee802154_beacon_req_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; }; struct ieee802154_association_req_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; struct ieee802154_assoc_req_pl assoc_req_pl; }; struct ieee802154_association_resp_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; struct ieee802154_assoc_resp_pl assoc_resp_pl; }; struct ieee802154_disassociation_notif_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; u8 disassoc_pl; }; /* pushes hdr onto the skb. fields of hdr->fc that can be calculated from * the contents of hdr will be, and the actual value of those bits in * hdr->fc will be ignored. this includes the INTRA_PAN bit and the frame * version, if SECEN is set. */ int ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr); /* pulls the entire 802.15.4 header off of the skb, including the security * header, and performs pan id decompression */ int ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr); /* parses the frame control, sequence number of address fields in a given skb * and stores them into hdr, performing pan id decompression and length checks * to be suitable for use in header_ops.parse */ int ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr); /* parses the full 802.15.4 header a given skb and stores them into hdr, * performing pan id decompression and length checks to be suitable for use in * header_ops.parse */ int ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr); /* pushes/pulls various frame types into/from an skb */ int ieee802154_beacon_push(struct sk_buff *skb, struct ieee802154_beacon_frame *beacon); int ieee802154_mac_cmd_push(struct sk_buff *skb, void *frame, const void *pl, unsigned int pl_len); int ieee802154_mac_cmd_pl_pull(struct sk_buff *skb, struct ieee802154_mac_cmd_pl *mac_pl); int ieee802154_max_payload(const struct ieee802154_hdr *hdr); static inline int ieee802154_sechdr_authtag_len(const struct ieee802154_sechdr *sec) { switch (sec->level) { case IEEE802154_SCF_SECLEVEL_MIC32: case IEEE802154_SCF_SECLEVEL_ENC_MIC32: return 4; case IEEE802154_SCF_SECLEVEL_MIC64: case IEEE802154_SCF_SECLEVEL_ENC_MIC64: return 8; case IEEE802154_SCF_SECLEVEL_MIC128: case IEEE802154_SCF_SECLEVEL_ENC_MIC128: return 16; case IEEE802154_SCF_SECLEVEL_NONE: case IEEE802154_SCF_SECLEVEL_ENC: default: return 0; } } static inline int ieee802154_hdr_length(struct sk_buff *skb) { struct ieee802154_hdr hdr; int len = ieee802154_hdr_pull(skb, &hdr); if (len > 0) skb_push(skb, len); return len; } static inline bool ieee802154_addr_equal(const struct ieee802154_addr *a1, const struct ieee802154_addr *a2) { if (a1->pan_id != a2->pan_id || a1->mode != a2->mode) return false; if ((a1->mode == IEEE802154_ADDR_LONG && a1->extended_addr != a2->extended_addr) || (a1->mode == IEEE802154_ADDR_SHORT && a1->short_addr != a2->short_addr)) return false; return true; } static inline __le64 ieee802154_devaddr_from_raw(const void *raw) { u64 temp; memcpy(&temp, raw, IEEE802154_ADDR_LEN); return (__force __le64)swab64(temp); } static inline void ieee802154_devaddr_to_raw(void *raw, __le64 addr) { u64 temp = swab64((__force u64)addr); memcpy(raw, &temp, IEEE802154_ADDR_LEN); } static inline int ieee802154_sockaddr_check_size(struct sockaddr_ieee802154 *daddr, int len) { struct ieee802154_addr_sa *sa; int ret = 0; sa = &daddr->addr; if (len < IEEE802154_MIN_NAMELEN) return -EINVAL; switch (sa->addr_type) { case IEEE802154_ADDR_NONE: break; case IEEE802154_ADDR_SHORT: if (len < IEEE802154_NAMELEN_SHORT) ret = -EINVAL; break; case IEEE802154_ADDR_LONG: if (len < IEEE802154_NAMELEN_LONG) ret = -EINVAL; break; default: ret = -EINVAL; break; } return ret; } static inline void ieee802154_addr_from_sa(struct ieee802154_addr *a, const struct ieee802154_addr_sa *sa) { a->mode = sa->addr_type; a->pan_id = cpu_to_le16(sa->pan_id); switch (a->mode) { case IEEE802154_ADDR_SHORT: a->short_addr = cpu_to_le16(sa->short_addr); break; case IEEE802154_ADDR_LONG: a->extended_addr = ieee802154_devaddr_from_raw(sa->hwaddr); break; } } static inline void ieee802154_addr_to_sa(struct ieee802154_addr_sa *sa, const struct ieee802154_addr *a) { sa->addr_type = a->mode; sa->pan_id = le16_to_cpu(a->pan_id); switch (a->mode) { case IEEE802154_ADDR_SHORT: sa->short_addr = le16_to_cpu(a->short_addr); break; case IEEE802154_ADDR_LONG: ieee802154_devaddr_to_raw(sa->hwaddr, a->extended_addr); break; } } /* * A control block of skb passed between the ARPHRD_IEEE802154 device * and other stack parts. */ struct ieee802154_mac_cb { u8 lqi; u8 type; bool ackreq; bool secen; bool secen_override; u8 seclevel; bool seclevel_override; struct ieee802154_addr source; struct ieee802154_addr dest; }; static inline struct ieee802154_mac_cb *mac_cb(struct sk_buff *skb) { return (struct ieee802154_mac_cb *)skb->cb; } static inline struct ieee802154_mac_cb *mac_cb_init(struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct ieee802154_mac_cb) > sizeof(skb->cb)); memset(skb->cb, 0, sizeof(struct ieee802154_mac_cb)); return mac_cb(skb); } enum { IEEE802154_LLSEC_DEVKEY_IGNORE, IEEE802154_LLSEC_DEVKEY_RESTRICT, IEEE802154_LLSEC_DEVKEY_RECORD, __IEEE802154_LLSEC_DEVKEY_MAX, }; #define IEEE802154_MAC_SCAN_ED 0 #define IEEE802154_MAC_SCAN_ACTIVE 1 #define IEEE802154_MAC_SCAN_PASSIVE 2 #define IEEE802154_MAC_SCAN_ORPHAN 3 struct ieee802154_mac_params { s8 transmit_power; u8 min_be; u8 max_be; u8 csma_retries; s8 frame_retries; bool lbt; struct wpan_phy_cca cca; s32 cca_ed_level; }; struct wpan_phy; enum { IEEE802154_LLSEC_PARAM_ENABLED = BIT(0), IEEE802154_LLSEC_PARAM_FRAME_COUNTER = BIT(1), IEEE802154_LLSEC_PARAM_OUT_LEVEL = BIT(2), IEEE802154_LLSEC_PARAM_OUT_KEY = BIT(3), IEEE802154_LLSEC_PARAM_KEY_SOURCE = BIT(4), IEEE802154_LLSEC_PARAM_PAN_ID = BIT(5), IEEE802154_LLSEC_PARAM_HWADDR = BIT(6), IEEE802154_LLSEC_PARAM_COORD_HWADDR = BIT(7), IEEE802154_LLSEC_PARAM_COORD_SHORTADDR = BIT(8), }; struct ieee802154_llsec_ops { int (*get_params)(struct net_device *dev, struct ieee802154_llsec_params *params); int (*set_params)(struct net_device *dev, const struct ieee802154_llsec_params *params, int changed); int (*add_key)(struct net_device *dev, const struct ieee802154_llsec_key_id *id, const struct ieee802154_llsec_key *key); int (*del_key)(struct net_device *dev, const struct ieee802154_llsec_key_id *id); int (*add_dev)(struct net_device *dev, const struct ieee802154_llsec_device *llsec_dev); int (*del_dev)(struct net_device *dev, __le64 dev_addr); int (*add_devkey)(struct net_device *dev, __le64 device_addr, const struct ieee802154_llsec_device_key *key); int (*del_devkey)(struct net_device *dev, __le64 device_addr, const struct ieee802154_llsec_device_key *key); int (*add_seclevel)(struct net_device *dev, const struct ieee802154_llsec_seclevel *sl); int (*del_seclevel)(struct net_device *dev, const struct ieee802154_llsec_seclevel *sl); void (*lock_table)(struct net_device *dev); void (*get_table)(struct net_device *dev, struct ieee802154_llsec_table **t); void (*unlock_table)(struct net_device *dev); }; /* * This should be located at net_device->ml_priv * * get_phy should increment the reference counting on returned phy. * Use wpan_wpy_put to put that reference. */ struct ieee802154_mlme_ops { /* The following fields are optional (can be NULL). */ int (*assoc_req)(struct net_device *dev, struct ieee802154_addr *addr, u8 channel, u8 page, u8 cap); int (*assoc_resp)(struct net_device *dev, struct ieee802154_addr *addr, __le16 short_addr, u8 status); int (*disassoc_req)(struct net_device *dev, struct ieee802154_addr *addr, u8 reason); int (*start_req)(struct net_device *dev, struct ieee802154_addr *addr, u8 channel, u8 page, u8 bcn_ord, u8 sf_ord, u8 pan_coord, u8 blx, u8 coord_realign); int (*scan_req)(struct net_device *dev, u8 type, u32 channels, u8 page, u8 duration); int (*set_mac_params)(struct net_device *dev, const struct ieee802154_mac_params *params); void (*get_mac_params)(struct net_device *dev, struct ieee802154_mac_params *params); const struct ieee802154_llsec_ops *llsec; }; static inline struct ieee802154_mlme_ops * ieee802154_mlme_ops(const struct net_device *dev) { return dev->ml_priv; } #endif
1495 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 // SPDX-License-Identifier: GPL-2.0 /* * USB-ACPI glue code * * Copyright 2012 Red Hat <mjg@redhat.com> */ #include <linux/module.h> #include <linux/usb.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/pci.h> #include <linux/usb/hcd.h> #include "hub.h" /** * usb_acpi_power_manageable - check whether usb port has * acpi power resource. * @hdev: USB device belonging to the usb hub * @index: port index based zero * * Return true if the port has acpi power resource and false if no. */ bool usb_acpi_power_manageable(struct usb_device *hdev, int index) { acpi_handle port_handle; int port1 = index + 1; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (port_handle) return acpi_bus_power_manageable(port_handle); else return false; } EXPORT_SYMBOL_GPL(usb_acpi_power_manageable); #define UUID_USB_CONTROLLER_DSM "ce2ee385-00e6-48cb-9f05-2edb927c4899" #define USB_DSM_DISABLE_U1_U2_FOR_PORT 5 /** * usb_acpi_port_lpm_incapable - check if lpm should be disabled for a port. * @hdev: USB device belonging to the usb hub * @index: zero based port index * * Some USB3 ports may not support USB3 link power management U1/U2 states * due to different retimer setup. ACPI provides _DSM method which returns 0x01 * if U1 and U2 states should be disabled. Evaluate _DSM with: * Arg0: UUID = ce2ee385-00e6-48cb-9f05-2edb927c4899 * Arg1: Revision ID = 0 * Arg2: Function Index = 5 * Arg3: (empty) * * Return 1 if USB3 port is LPM incapable, negative on error, otherwise 0 */ int usb_acpi_port_lpm_incapable(struct usb_device *hdev, int index) { union acpi_object *obj; acpi_handle port_handle; int port1 = index + 1; guid_t guid; int ret; ret = guid_parse(UUID_USB_CONTROLLER_DSM, &guid); if (ret) return ret; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) { dev_dbg(&hdev->dev, "port-%d no acpi handle\n", port1); return -ENODEV; } if (!acpi_check_dsm(port_handle, &guid, 0, BIT(USB_DSM_DISABLE_U1_U2_FOR_PORT))) { dev_dbg(&hdev->dev, "port-%d no _DSM function %d\n", port1, USB_DSM_DISABLE_U1_U2_FOR_PORT); return -ENODEV; } obj = acpi_evaluate_dsm_typed(port_handle, &guid, 0, USB_DSM_DISABLE_U1_U2_FOR_PORT, NULL, ACPI_TYPE_INTEGER); if (!obj) { dev_dbg(&hdev->dev, "evaluate port-%d _DSM failed\n", port1); return -EINVAL; } if (obj->integer.value == 0x01) ret = 1; ACPI_FREE(obj); return ret; } EXPORT_SYMBOL_GPL(usb_acpi_port_lpm_incapable); /** * usb_acpi_set_power_state - control usb port's power via acpi power * resource * @hdev: USB device belonging to the usb hub * @index: port index based zero * @enable: power state expected to be set * * Notice to use usb_acpi_power_manageable() to check whether the usb port * has acpi power resource before invoking this function. * * Returns 0 on success, else negative errno. */ int usb_acpi_set_power_state(struct usb_device *hdev, int index, bool enable) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *port_dev; acpi_handle port_handle; unsigned char state; int port1 = index + 1; int error = -EINVAL; if (!hub) return -ENODEV; port_dev = hub->ports[port1 - 1]; port_handle = (acpi_handle) usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) return error; if (enable) state = ACPI_STATE_D0; else state = ACPI_STATE_D3_COLD; error = acpi_bus_set_power(port_handle, state); if (!error) dev_dbg(&port_dev->dev, "acpi: power was set to %d\n", enable); else dev_dbg(&port_dev->dev, "acpi: power failed to be set\n"); return error; } EXPORT_SYMBOL_GPL(usb_acpi_set_power_state); /** * usb_acpi_add_usb4_devlink - add device link to USB4 Host Interface for tunneled USB3 devices * * @udev: Tunneled USB3 device connected to a roothub. * * Adds a device link between a tunneled USB3 device and the USB4 Host Interface * device to ensure correct runtime PM suspend and resume order. This function * should only be called for tunneled USB3 devices. * The USB4 Host Interface this tunneled device depends on is found from the roothub * port ACPI device specific data _DSD entry. * * Return: negative error code on failure, 0 otherwise */ static int usb_acpi_add_usb4_devlink(struct usb_device *udev) { const struct device_link *link; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent || udev->parent->parent) return 0; hub = usb_hub_to_struct_hub(udev->parent); port_dev = hub->ports[udev->portnum - 1]; struct fwnode_handle *nhi_fwnode __free(fwnode_handle) = fwnode_find_reference(dev_fwnode(&port_dev->dev), "usb4-host-interface", 0); if (IS_ERR(nhi_fwnode) || !nhi_fwnode->dev) return 0; link = device_link_add(&port_dev->child->dev, nhi_fwnode->dev, DL_FLAG_STATELESS | DL_FLAG_RPM_ACTIVE | DL_FLAG_PM_RUNTIME); if (!link) { dev_err(&port_dev->dev, "Failed to created device link from %s to %s\n", dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev)); return -EINVAL; } dev_dbg(&port_dev->dev, "Created device link from %s to %s\n", dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev)); return 0; } /* * Private to usb-acpi, all the core needs to know is that * port_dev->location is non-zero when it has been set by the firmware. */ #define USB_ACPI_LOCATION_VALID (1 << 31) static void usb_acpi_get_connect_type(struct usb_port *port_dev, acpi_handle *handle) { enum usb_port_connect_type connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *upc = NULL; struct acpi_pld_info *pld = NULL; acpi_status status; /* * According to 9.14 in ACPI Spec 6.2. _PLD indicates whether usb port * is user visible and _UPC indicates whether it is connectable. If * the port was visible and connectable, it could be freely connected * and disconnected with USB devices. If no visible and connectable, * a usb device is directly hard-wired to the port. If no visible and * no connectable, the port would be not used. */ if (acpi_get_physical_device_location(handle, &pld) && pld) port_dev->location = USB_ACPI_LOCATION_VALID | pld->group_token << 8 | pld->group_position; status = acpi_evaluate_object(handle, "_UPC", NULL, &buffer); if (ACPI_FAILURE(status)) goto out; upc = buffer.pointer; if (!upc || (upc->type != ACPI_TYPE_PACKAGE) || upc->package.count != 4) goto out; /* UPC states port is connectable */ if (upc->package.elements[0].integer.value) if (!pld) ; /* keep connect_type as unknown */ else if (pld->user_visible) connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG; else connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED; else connect_type = USB_PORT_NOT_USED; out: port_dev->connect_type = connect_type; kfree(upc); ACPI_FREE(pld); } static struct acpi_device * usb_acpi_get_companion_for_port(struct usb_port *port_dev) { struct usb_device *udev; struct acpi_device *adev; acpi_handle *parent_handle; int port1; /* Get the struct usb_device point of port's hub */ udev = to_usb_device(port_dev->dev.parent->parent); /* * The root hub ports' parent is the root hub. The non-root-hub * ports' parent is the parent hub port which the hub is * connected to. */ if (!udev->parent) { adev = ACPI_COMPANION(&udev->dev); port1 = usb_hcd_find_raw_port_number(bus_to_hcd(udev->bus), port_dev->portnum); } else { parent_handle = usb_get_hub_port_acpi_handle(udev->parent, udev->portnum); if (!parent_handle) return NULL; adev = acpi_fetch_acpi_dev(parent_handle); port1 = port_dev->portnum; } return acpi_find_child_by_adr(adev, port1); } static struct acpi_device * usb_acpi_find_companion_for_port(struct usb_port *port_dev) { struct acpi_device *adev; adev = usb_acpi_get_companion_for_port(port_dev); if (!adev) return NULL; usb_acpi_get_connect_type(port_dev, adev->handle); return adev; } static struct acpi_device * usb_acpi_find_companion_for_device(struct usb_device *udev) { struct acpi_device *adev; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent) { /* * root hub is only child (_ADR=0) under its parent, the HC. * sysdev pointer is the HC as seen from firmware. */ adev = ACPI_COMPANION(udev->bus->sysdev); return acpi_find_child_device(adev, 0, false); } hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return NULL; /* Tunneled USB3 devices depend on USB4 Host Interface, set device link to it */ if (udev->speed >= USB_SPEED_SUPER && udev->tunnel_mode != USB_LINK_NATIVE) usb_acpi_add_usb4_devlink(udev); /* * This is an embedded USB device connected to a port and such * devices share port's ACPI companion. */ port_dev = hub->ports[udev->portnum - 1]; return usb_acpi_get_companion_for_port(port_dev); } static struct acpi_device *usb_acpi_find_companion(struct device *dev) { /* * The USB hierarchy like following: * * Device (EHC1) * Device (HUBN) * Device (PR01) * Device (PR11) * Device (PR12) * Device (FN12) * Device (FN13) * Device (PR13) * ... * where HUBN is root hub, and PRNN are USB ports and devices * connected to them, and FNNN are individualk functions for * connected composite USB devices. PRNN and FNNN may contain * _CRS and other methods describing sideband resources for * the connected device. * * On the kernel side both root hub and embedded USB devices are * represented as instances of usb_device structure, and ports * are represented as usb_port structures, so the whole process * is split into 2 parts: finding companions for devices and * finding companions for ports. * * Note that we do not handle individual functions of composite * devices yet, for that we would need to assign companions to * devices corresponding to USB interfaces. */ if (is_usb_device(dev)) return usb_acpi_find_companion_for_device(to_usb_device(dev)); else if (is_usb_port(dev)) return usb_acpi_find_companion_for_port(to_usb_port(dev)); return NULL; } static bool usb_acpi_bus_match(struct device *dev) { return is_usb_device(dev) || is_usb_port(dev); } static struct acpi_bus_type usb_acpi_bus = { .name = "USB", .match = usb_acpi_bus_match, .find_companion = usb_acpi_find_companion, }; int usb_acpi_register(void) { return register_acpi_bus_type(&usb_acpi_bus); } void usb_acpi_unregister(void) { unregister_acpi_bus_type(&usb_acpi_bus); }
21 21 21 21 21 21 21 21 21 21 21 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com> */ #include <linux/sunrpc/clnt.h> #include <linux/kobject.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtsock.h> #include "sysfs.h" struct xprt_addr { const char *addr; struct rcu_head rcu; }; static void free_xprt_addr(struct rcu_head *head) { struct xprt_addr *addr = container_of(head, struct xprt_addr, rcu); kfree(addr->addr); kfree(addr); } static struct kset *rpc_sunrpc_kset; static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj; static void rpc_sysfs_object_release(struct kobject *kobj) { kfree(kobj); } static const struct kobj_ns_type_operations * rpc_sysfs_object_child_ns_type(const struct kobject *kobj) { return &net_ns_type_operations; } static const struct kobj_type rpc_sysfs_object_type = { .release = rpc_sysfs_object_release, .sysfs_ops = &kobj_sysfs_ops, .child_ns_type = rpc_sysfs_object_child_ns_type, }; static struct kobject *rpc_sysfs_object_alloc(const char *name, struct kset *kset, struct kobject *parent) { struct kobject *kobj; kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); if (kobj) { kobj->kset = kset; if (kobject_init_and_add(kobj, &rpc_sysfs_object_type, parent, "%s", name) == 0) return kobj; kobject_put(kobj); } return NULL; } static inline struct rpc_clnt * rpc_sysfs_client_kobj_get_clnt(struct kobject *kobj) { struct rpc_sysfs_client *c = container_of(kobj, struct rpc_sysfs_client, kobject); struct rpc_clnt *ret = c->clnt; return refcount_inc_not_zero(&ret->cl_count) ? ret : NULL; } static inline struct rpc_xprt * rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) { struct rpc_sysfs_xprt *x = container_of(kobj, struct rpc_sysfs_xprt, kobject); return xprt_get(x->xprt); } static inline struct rpc_xprt_switch * rpc_sysfs_xprt_kobj_get_xprt_switch(struct kobject *kobj) { struct rpc_sysfs_xprt *x = container_of(kobj, struct rpc_sysfs_xprt, kobject); return xprt_switch_get(x->xprt_switch); } static inline struct rpc_xprt_switch * rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) { struct rpc_sysfs_xprt_switch *x = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); return xprt_switch_get(x->xprt_switch); } static ssize_t rpc_sysfs_clnt_version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); ssize_t ret; if (!clnt) return sprintf(buf, "<closed>\n"); ret = sprintf(buf, "%u", clnt->cl_vers); refcount_dec(&clnt->cl_count); return ret; } static ssize_t rpc_sysfs_clnt_program_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); ssize_t ret; if (!clnt) return sprintf(buf, "<closed>\n"); ret = sprintf(buf, "%s", clnt->cl_program->name); refcount_dec(&clnt->cl_count); return ret; } static ssize_t rpc_sysfs_clnt_max_connect_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); ssize_t ret; if (!clnt) return sprintf(buf, "<closed>\n"); ret = sprintf(buf, "%u\n", clnt->cl_max_connect); refcount_dec(&clnt->cl_count); return ret; } static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; if (!xprt) { ret = sprintf(buf, "<closed>\n"); goto out; } ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); xprt_put(xprt); out: return ret; } static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); size_t buflen = PAGE_SIZE; ssize_t ret; if (!xprt || !xprt_connected(xprt)) { ret = sprintf(buf, "<closed>\n"); } else if (xprt->ops->get_srcaddr) { ret = xprt->ops->get_srcaddr(xprt, buf, buflen); if (ret > 0) { if (ret < buflen - 1) { buf[ret] = '\n'; ret++; buf[ret] = '\0'; } } else ret = sprintf(buf, "<closed>\n"); } else ret = sprintf(buf, "<not a socket>\n"); xprt_put(xprt); return ret; } static const char *xprtsec_strings[] = { [RPC_XPRTSEC_NONE] = "none", [RPC_XPRTSEC_TLS_ANON] = "tls-anon", [RPC_XPRTSEC_TLS_X509] = "tls-x509", }; static ssize_t rpc_sysfs_xprt_xprtsec_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; if (!xprt) { ret = sprintf(buf, "<closed>\n"); goto out; } ret = sprintf(buf, "%s\n", xprtsec_strings[xprt->xprtsec.policy]); xprt_put(xprt); out: return ret; } static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); unsigned short srcport = 0; size_t buflen = PAGE_SIZE; ssize_t ret; if (!xprt || !xprt_connected(xprt)) { ret = sprintf(buf, "<closed>\n"); goto out; } if (xprt->ops->get_srcport) srcport = xprt->ops->get_srcport(xprt); ret = snprintf(buf, buflen, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" "tasks_queuelen=%ld\ndst_port=%s\n", xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, xprt->pending.qlen, xprt->backlog.qlen, xprt->main, srcport, atomic_long_read(&xprt->queuelen), xprt->address_strings[RPC_DISPLAY_PORT]); out: xprt_put(xprt); return ret; } static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; int locked, connected, connecting, close_wait, bound, binding, closing, congested, cwnd_wait, write_space, offline, remove; if (!(xprt && xprt->state)) { ret = sprintf(buf, "state=CLOSED\n"); } else { locked = test_bit(XPRT_LOCKED, &xprt->state); connected = test_bit(XPRT_CONNECTED, &xprt->state); connecting = test_bit(XPRT_CONNECTING, &xprt->state); close_wait = test_bit(XPRT_CLOSE_WAIT, &xprt->state); bound = test_bit(XPRT_BOUND, &xprt->state); binding = test_bit(XPRT_BINDING, &xprt->state); closing = test_bit(XPRT_CLOSING, &xprt->state); congested = test_bit(XPRT_CONGESTED, &xprt->state); cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state); write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state); offline = test_bit(XPRT_OFFLINE, &xprt->state); remove = test_bit(XPRT_REMOVE, &xprt->state); ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s %s\n", locked ? "LOCKED" : "", connected ? "CONNECTED" : "", connecting ? "CONNECTING" : "", close_wait ? "CLOSE_WAIT" : "", bound ? "BOUND" : "", binding ? "BOUNDING" : "", closing ? "CLOSING" : "", congested ? "CONGESTED" : "", cwnd_wait ? "CWND_WAIT" : "", write_space ? "WRITE_SPACE" : "", offline ? "OFFLINE" : "", remove ? "REMOVE" : ""); } xprt_put(xprt); return ret; } static ssize_t rpc_sysfs_xprt_del_xprt_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "# delete this xprt\n"); } static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt_switch *xprt_switch = rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); ssize_t ret; if (!xprt_switch) return 0; ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\n" "num_unique_destaddr=%u\nqueue_len=%ld\n", xprt_switch->xps_nxprts, xprt_switch->xps_nactive, xprt_switch->xps_nunique_destaddr_xprts, atomic_long_read(&xprt_switch->xps_queuelen)); xprt_switch_put(xprt_switch); return ret; } static ssize_t rpc_sysfs_xprt_switch_add_xprt_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "# add one xprt to this xprt_switch\n"); } static ssize_t rpc_sysfs_xprt_switch_add_xprt_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt_switch *xprt_switch = rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); struct xprt_create xprt_create_args; struct rpc_xprt *xprt, *new; if (!xprt_switch) return 0; xprt = rpc_xprt_switch_get_main_xprt(xprt_switch); if (!xprt) goto out; xprt_create_args.ident = xprt->xprt_class->ident; xprt_create_args.net = xprt->xprt_net; xprt_create_args.dstaddr = (struct sockaddr *)&xprt->addr; xprt_create_args.addrlen = xprt->addrlen; xprt_create_args.servername = xprt->servername; xprt_create_args.bc_xprt = xprt->bc_xprt; xprt_create_args.xprtsec = xprt->xprtsec; xprt_create_args.connect_timeout = xprt->connect_timeout; xprt_create_args.reconnect_timeout = xprt->max_reconnect_timeout; new = xprt_create_transport(&xprt_create_args); if (IS_ERR_OR_NULL(new)) { count = PTR_ERR(new); goto out_put_xprt; } rpc_xprt_switch_add_xprt(xprt_switch, new); xprt_put(new); out_put_xprt: xprt_put(xprt); out: xprt_switch_put(xprt_switch); return count; } static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); struct sockaddr *saddr; char *dst_addr; int port; struct xprt_addr *saved_addr; size_t buf_len; if (!xprt) return 0; if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP || xprt->xprt_class->ident == XPRT_TRANSPORT_TCP_TLS || xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) { xprt_put(xprt); return -EOPNOTSUPP; } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; goto out_put; } saddr = (struct sockaddr *)&xprt->addr; port = rpc_get_port(saddr); /* buf_len is the len until the first occurence of either * '\n' or '\0' */ buf_len = strcspn(buf, "\n"); dst_addr = kstrndup(buf, buf_len, GFP_KERNEL); if (!dst_addr) goto out_err; saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL); if (!saved_addr) goto out_err_free; saved_addr->addr = rcu_dereference_raw(xprt->address_strings[RPC_DISPLAY_ADDR]); rcu_assign_pointer(xprt->address_strings[RPC_DISPLAY_ADDR], dst_addr); call_rcu(&saved_addr->rcu, free_xprt_addr); xprt->addrlen = rpc_pton(xprt->xprt_net, buf, buf_len, saddr, sizeof(*saddr)); rpc_set_port(saddr, port); xprt_force_disconnect(xprt); out: xprt_release_write(xprt, NULL); out_put: xprt_put(xprt); return count; out_err_free: kfree(dst_addr); out_err: count = -ENOMEM; goto out; } static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); int offline = 0, online = 0, remove = 0; struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); if (!xprt || !xps) { count = 0; goto out_put; } if (!strncmp(buf, "offline", 7)) offline = 1; else if (!strncmp(buf, "online", 6)) online = 1; else if (!strncmp(buf, "remove", 6)) remove = 1; else { count = -EINVAL; goto out_put; } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; goto out_put; } if (xprt->main) { count = -EINVAL; goto release_tasks; } if (offline) { xprt_set_offline_locked(xprt, xps); } else if (online) { xprt_set_online_locked(xprt, xps); } else if (remove) { if (test_bit(XPRT_OFFLINE, &xprt->state)) xprt_delete_locked(xprt, xps); else count = -EINVAL; } release_tasks: xprt_release_write(xprt, NULL); out_put: xprt_put(xprt); xprt_switch_put(xps); return count; } static ssize_t rpc_sysfs_xprt_del_xprt(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); if (!xprt || !xps) { count = 0; goto out; } if (xprt->main) { count = -EINVAL; goto release_tasks; } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; goto out_put; } xprt_set_offline_locked(xprt, xps); xprt_delete_locked(xprt, xps); release_tasks: xprt_release_write(xprt, NULL); out_put: xprt_put(xprt); xprt_switch_put(xps); out: return count; } int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); if (!rpc_sunrpc_kset) return -ENOMEM; rpc_sunrpc_client_kobj = rpc_sysfs_object_alloc("rpc-clients", rpc_sunrpc_kset, NULL); if (!rpc_sunrpc_client_kobj) goto err_client; rpc_sunrpc_xprt_switch_kobj = rpc_sysfs_object_alloc("xprt-switches", rpc_sunrpc_kset, NULL); if (!rpc_sunrpc_xprt_switch_kobj) goto err_switch; return 0; err_switch: kobject_put(rpc_sunrpc_client_kobj); rpc_sunrpc_client_kobj = NULL; err_client: kset_unregister(rpc_sunrpc_kset); rpc_sunrpc_kset = NULL; return -ENOMEM; } static void rpc_sysfs_client_release(struct kobject *kobj) { struct rpc_sysfs_client *c; c = container_of(kobj, struct rpc_sysfs_client, kobject); kfree(c); } static void rpc_sysfs_xprt_switch_release(struct kobject *kobj) { struct rpc_sysfs_xprt_switch *xprt_switch; xprt_switch = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); kfree(xprt_switch); } static void rpc_sysfs_xprt_release(struct kobject *kobj) { struct rpc_sysfs_xprt *xprt; xprt = container_of(kobj, struct rpc_sysfs_xprt, kobject); kfree(xprt); } static const void *rpc_sysfs_client_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_client, kobject)->net; } static const void *rpc_sysfs_xprt_switch_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; } static const void *rpc_sysfs_xprt_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_xprt, kobject)->xprt->xprt_net; } static struct kobj_attribute rpc_sysfs_clnt_version = __ATTR(rpc_version, 0444, rpc_sysfs_clnt_version_show, NULL); static struct kobj_attribute rpc_sysfs_clnt_program = __ATTR(program, 0444, rpc_sysfs_clnt_program_show, NULL); static struct kobj_attribute rpc_sysfs_clnt_max_connect = __ATTR(max_connect, 0444, rpc_sysfs_clnt_max_connect_show, NULL); static struct attribute *rpc_sysfs_rpc_clnt_attrs[] = { &rpc_sysfs_clnt_version.attr, &rpc_sysfs_clnt_program.attr, &rpc_sysfs_clnt_max_connect.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_rpc_clnt); static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, 0644, rpc_sysfs_xprt_srcaddr_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_xprtsec = __ATTR(xprtsec, 0644, rpc_sysfs_xprt_xprtsec_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); static struct kobj_attribute rpc_sysfs_xprt_del = __ATTR(del_xprt, 0644, rpc_sysfs_xprt_del_xprt_show, rpc_sysfs_xprt_del_xprt); static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, &rpc_sysfs_xprt_srcaddr.attr, &rpc_sysfs_xprt_xprtsec.attr, &rpc_sysfs_xprt_info.attr, &rpc_sysfs_xprt_change_state.attr, &rpc_sysfs_xprt_del.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt); static struct kobj_attribute rpc_sysfs_xprt_switch_info = __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_switch_add_xprt = __ATTR(add_xprt, 0644, rpc_sysfs_xprt_switch_add_xprt_show, rpc_sysfs_xprt_switch_add_xprt_store); static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { &rpc_sysfs_xprt_switch_info.attr, &rpc_sysfs_xprt_switch_add_xprt.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); static const struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, .default_groups = rpc_sysfs_rpc_clnt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_client_namespace, }; static const struct kobj_type rpc_sysfs_xprt_switch_type = { .release = rpc_sysfs_xprt_switch_release, .default_groups = rpc_sysfs_xprt_switch_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_switch_namespace, }; static const struct kobj_type rpc_sysfs_xprt_type = { .release = rpc_sysfs_xprt_release, .default_groups = rpc_sysfs_xprt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_namespace, }; void rpc_sysfs_exit(void) { kobject_put(rpc_sunrpc_client_kobj); kobject_put(rpc_sunrpc_xprt_switch_kobj); kset_unregister(rpc_sunrpc_kset); } static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, struct net *net, int clid) { struct rpc_sysfs_client *p; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_client_type, parent, "clnt-%d", clid) == 0) return p; kobject_put(&p->kobject); } return NULL; } static struct rpc_sysfs_xprt_switch * rpc_sysfs_xprt_switch_alloc(struct kobject *parent, struct rpc_xprt_switch *xprt_switch, struct net *net, gfp_t gfp_flags) { struct rpc_sysfs_xprt_switch *p; p = kzalloc(sizeof(*p), gfp_flags); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_switch_type, parent, "switch-%d", xprt_switch->xps_id) == 0) return p; kobject_put(&p->kobject); } return NULL; } static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt *p; p = kzalloc(sizeof(*p), gfp_flags); if (!p) goto out; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_type, parent, "xprt-%d-%s", xprt->id, xprt->address_strings[RPC_DISPLAY_PROTO]) == 0) return p; kobject_put(&p->kobject); out: return NULL; } void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct rpc_xprt_switch *xprt_switch, struct net *net) { struct rpc_sysfs_client *rpc_client; struct rpc_sysfs_xprt_switch *xswitch = (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; if (!xswitch) return; rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, net, clnt->cl_clid); if (rpc_client) { char name[] = "switch"; int ret; clnt->cl_sysfs = rpc_client; rpc_client->clnt = clnt; rpc_client->xprt_switch = xprt_switch; kobject_uevent(&rpc_client->kobject, KOBJ_ADD); ret = sysfs_create_link_nowarn(&rpc_client->kobject, &xswitch->kobject, name); if (ret) pr_warn("can't create link to %s in sysfs (%d)\n", name, ret); } } void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt_switch *rpc_xprt_switch; struct net *net; if (xprt_switch->xps_net) net = xprt_switch->xps_net; else net = xprt->xprt_net; rpc_xprt_switch = rpc_sysfs_xprt_switch_alloc(rpc_sunrpc_xprt_switch_kobj, xprt_switch, net, gfp_flags); if (rpc_xprt_switch) { xprt_switch->xps_sysfs = rpc_xprt_switch; rpc_xprt_switch->xprt_switch = xprt_switch; rpc_xprt_switch->xprt = xprt; kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD); } else { xprt_switch->xps_sysfs = NULL; } } void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt *rpc_xprt; struct rpc_sysfs_xprt_switch *switch_obj = (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; if (!switch_obj) return; rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags); if (rpc_xprt) { xprt->xprt_sysfs = rpc_xprt; rpc_xprt->xprt = xprt; rpc_xprt->xprt_switch = xprt_switch; kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD); } } void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) { struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; if (rpc_client) { char name[] = "switch"; sysfs_remove_link(&rpc_client->kobject, name); kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE); kobject_del(&rpc_client->kobject); kobject_put(&rpc_client->kobject); clnt->cl_sysfs = NULL; } } void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch) { struct rpc_sysfs_xprt_switch *rpc_xprt_switch = xprt_switch->xps_sysfs; if (rpc_xprt_switch) { kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_REMOVE); kobject_del(&rpc_xprt_switch->kobject); kobject_put(&rpc_xprt_switch->kobject); xprt_switch->xps_sysfs = NULL; } } void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt) { struct rpc_sysfs_xprt *rpc_xprt = xprt->xprt_sysfs; if (rpc_xprt) { kobject_uevent(&rpc_xprt->kobject, KOBJ_REMOVE); kobject_del(&rpc_xprt->kobject); kobject_put(&rpc_xprt->kobject); xprt->xprt_sysfs = NULL; } }
21 1 86 2 151 134 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 /* SPDX-License-Identifier: GPL-2.0 */ /* * Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/types.h> struct mptcp_info; struct mptcp_sock; struct mptcp_pm_addr_entry; struct seq_file; /* MPTCP sk_buff extension data */ struct mptcp_ext { union { u64 data_ack; u32 data_ack32; }; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, frozen:1, reset_transient:1; u8 reset_reason:4, csum_reqd:1, infinite_map:1; }; #define MPTCPOPT_HMAC_LEN 20 #define MPTCP_RM_IDS_MAX 8 struct mptcp_rm_list { u8 ids[MPTCP_RM_IDS_MAX]; u8 nr; }; struct mptcp_addr_info { u8 id; sa_family_t family; __be16 port; union { struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct in6_addr addr6; #endif }; }; struct mptcp_out_options { #if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; struct mptcp_rm_list rm_list; u8 join_id; u8 backup; u8 reset_reason:4, reset_transient:1, csum_reqd:1, allow_join_id0:1; union { struct { u64 sndr_key; u64 rcvr_key; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; }; struct { struct mptcp_addr_info addr; u64 ahmac; }; struct { struct mptcp_ext ext_copy; u64 fail_seq; }; struct { u32 nonce; u32 token; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; }; }; #endif }; #define MPTCP_SCHED_NAME_MAX 16 #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) #define MPTCP_SUBFLOWS_MAX 8 struct mptcp_sched_data { u8 subflows; struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; }; struct mptcp_sched_ops { int (*get_send)(struct mptcp_sock *msk, struct mptcp_sched_data *data); int (*get_retrans)(struct mptcp_sock *msk, struct mptcp_sched_data *data); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; struct list_head list; void (*init)(struct mptcp_sock *msk); void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; #define MPTCP_PM_NAME_MAX 16 #define MPTCP_PM_MAX 128 #define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX) struct mptcp_pm_ops { char name[MPTCP_PM_NAME_MAX]; struct module *owner; struct list_head list; void (*init)(struct mptcp_sock *msk); void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; #ifdef CONFIG_MPTCP void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) { return tcp_sk(sk)->is_mptcp; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp; } static inline bool rsk_drop_req(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; } void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts); void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info); /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ static inline void mptcp_skb_ext_move(struct sk_buff *to, struct sk_buff *from) { if (!skb_ext_exist(from, SKB_EXT_MPTCP)) return; if (WARN_ON_ONCE(to->active_extensions)) skb_ext_put(to); to->active_extensions = from->active_extensions; to->extensions = from->extensions; from->active_extensions = 0; } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { struct mptcp_ext *from_ext; from_ext = skb_ext_find(from, SKB_EXT_MPTCP); if (!from_ext) return; from_ext->frozen = 1; skb_ext_copy(to, from); } static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { /* MPTCP always clears the ext when adding it to the skb, so * holes do not bother us here */ return !from_ext || (to_ext && from_ext && !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); } /* check if skbs can be collapsed. * MPTCP collapse is allowed if neither @to or @from carry an mptcp data * mapping, or if the extension of @to is the same as @from. * Collapsing is not possible if @to lacks an extension, but @from carries one. */ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), skb_ext_find(from, SKB_EXT_MPTCP)); } void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { if (skb_ext_exist(skb, SKB_EXT_MPTCP)) return mptcp_get_reset_option(skb); return htonl(0u); } void mptcp_active_detect_blackhole(struct sock *sk, bool expired); #else static inline void mptcp_init(void) { } static inline bool sk_is_mptcp(const struct sock *sk) { return false; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return false; } static inline bool rsk_drop_req(const struct request_sock *req) { return false; } static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { return true; } static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { } static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return true; } static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { return 0; /* TCP fallback */ } static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { return NULL; } static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } static inline void mptcp_active_detect_blackhole(struct sock *sk, bool expired) { } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcpv6_init(void); void mptcpv6_handle_mapped(struct sock *sk, bool mapped); #elif IS_ENABLED(CONFIG_IPV6) static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif #if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); #else static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } #endif #if !IS_ENABLED(CONFIG_MPTCP) struct mptcp_sock { }; #endif #endif /* __NET_MPTCP_H */
47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * * File: integrity_iint.c * - initialize the integrity directory in securityfs * - load IMA and EVM keys */ #include <linux/security.h> #include "integrity.h" struct dentry *integrity_dir; /* * integrity_kernel_read - read data from the file * * This is a function for reading file content instead of kernel_read(). * It does not perform locking checks to ensure it cannot be blocked. * It does not perform security checks because it is irrelevant for IMA. * */ int integrity_kernel_read(struct file *file, loff_t offset, void *addr, unsigned long count) { return __kernel_read(file, addr, count, &offset); } /* * integrity_load_keys - load integrity keys hook * * Hooks is called from init/main.c:kernel_init_freeable() * when rootfs is ready */ void __init integrity_load_keys(void) { ima_load_x509(); if (!IS_ENABLED(CONFIG_IMA_LOAD_X509)) evm_load_x509(); } static int __init integrity_fs_init(void) { integrity_dir = securityfs_create_dir("integrity", NULL); if (IS_ERR(integrity_dir)) { int ret = PTR_ERR(integrity_dir); if (ret != -ENODEV) pr_err("Unable to create integrity sysfs dir: %d\n", ret); integrity_dir = NULL; return ret; } return 0; } late_initcall(integrity_fs_init)
16 15 16 16 3 16 15 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* * Network Service Header * * Copyright (c) 2017 Red Hat, Inc. -- Jiri Benc <jbenc@redhat.com> */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/gso.h> #include <net/nsh.h> #include <net/tun_proto.h> int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh) { struct nshhdr *nh; size_t length = nsh_hdr_len(pushed_nh); u8 next_proto; if (skb->mac_len) { next_proto = TUN_P_ETHERNET; } else { next_proto = tun_p_from_eth_p(skb->protocol); if (!next_proto) return -EAFNOSUPPORT; } /* Add the NSH header */ if (skb_cow_head(skb, length) < 0) return -ENOMEM; skb_push(skb, length); nh = (struct nshhdr *)(skb->data); memcpy(nh, pushed_nh, length); nh->np = next_proto; skb_postpush_rcsum(skb, nh, length); skb->protocol = htons(ETH_P_NSH); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_mac_len(skb); return 0; } EXPORT_SYMBOL_GPL(nsh_push); int nsh_pop(struct sk_buff *skb) { struct nshhdr *nh; size_t length; __be16 inner_proto; if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN)) return -ENOMEM; nh = (struct nshhdr *)(skb->data); length = nsh_hdr_len(nh); if (length < NSH_BASE_HDR_LEN) return -EINVAL; inner_proto = tun_p_to_eth_p(nh->np); if (!pskb_may_pull(skb, length)) return -ENOMEM; if (!inner_proto) return -EAFNOSUPPORT; skb_pull_rcsum(skb, length); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_mac_len(skb); skb->protocol = inner_proto; return 0; } EXPORT_SYMBOL_GPL(nsh_pop); static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { unsigned int outer_hlen, mac_len, nsh_len; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 outer_proto, proto; skb_reset_network_header(skb); outer_proto = skb->protocol; outer_hlen = skb_mac_header_len(skb); mac_len = skb->mac_len; if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN))) goto out; nsh_len = nsh_hdr_len(nsh_hdr(skb)); if (nsh_len < NSH_BASE_HDR_LEN) goto out; if (unlikely(!pskb_may_pull(skb, nsh_len))) goto out; proto = tun_p_to_eth_p(nsh_hdr(skb)->np); if (!proto) goto out; __skb_pull(skb, nsh_len); skb_reset_mac_header(skb); skb->mac_len = proto == htons(ETH_P_TEB) ? ETH_HLEN : 0; skb->protocol = proto; features &= NETIF_F_SG; segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, htons(ETH_P_NSH), nsh_len, mac_offset, mac_len); goto out; } for (skb = segs; skb; skb = skb->next) { skb->protocol = outer_proto; __skb_push(skb, nsh_len + outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, outer_hlen); skb->mac_len = mac_len; } out: return segs; } static struct packet_offload nsh_packet_offload __read_mostly = { .type = htons(ETH_P_NSH), .priority = 15, .callbacks = { .gso_segment = nsh_gso_segment, }, }; static int __init nsh_init_module(void) { dev_add_offload(&nsh_packet_offload); return 0; } static void __exit nsh_cleanup_module(void) { dev_remove_offload(&nsh_packet_offload); } module_init(nsh_init_module); module_exit(nsh_cleanup_module); MODULE_AUTHOR("Jiri Benc <jbenc@redhat.com>"); MODULE_DESCRIPTION("NSH protocol"); MODULE_LICENSE("GPL v2");
12 12 12 12 12 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to sysfs handling */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-wbt.h" #include "blk-cgroup.h" #include "blk-throttle.h" struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); ssize_t (*show_limit)(struct gendisk *disk, char *page); ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); int (*store_limit)(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim); }; static ssize_t queue_var_show(unsigned long var, char *page) { return sysfs_emit(page, "%lu\n", var); } static ssize_t queue_var_store(unsigned long *var, const char *page, size_t count) { int err; unsigned long v; err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; *var = v; return count; } static ssize_t queue_requests_show(struct gendisk *disk, char *page) { ssize_t ret; mutex_lock(&disk->queue->elevator_lock); ret = queue_var_show(disk->queue->nr_requests, page); mutex_unlock(&disk->queue->elevator_lock); return ret; } static ssize_t queue_requests_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nr; int ret, err; unsigned int memflags; struct request_queue *q = disk->queue; if (!queue_is_mq(q)) return -EINVAL; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; memflags = blk_mq_freeze_queue(q); mutex_lock(&q->elevator_lock); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; err = blk_mq_update_nr_requests(disk->queue, nr); if (err) ret = err; mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); return ret; } static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; mutex_lock(&disk->queue->limits_lock); ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); mutex_unlock(&disk->queue->limits_lock); return ret; } static ssize_t queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; unsigned int memflags; struct request_queue *q = disk->queue; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; /* * ->ra_pages is protected by ->limits_lock because it is usually * calculated from the queue limits by queue_limits_commit_update. */ mutex_lock(&q->limits_lock); memflags = blk_mq_freeze_queue(q); disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); mutex_unlock(&q->limits_lock); blk_mq_unfreeze_queue(q, memflags); return ret; } #define QUEUE_SYSFS_LIMIT_SHOW(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return queue_var_show(disk->queue->limits._field, page); \ } QUEUE_SYSFS_LIMIT_SHOW(max_segments) QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) QUEUE_SYSFS_LIMIT_SHOW(io_min) QUEUE_SYSFS_LIMIT_SHOW(io_opt) QUEUE_SYSFS_LIMIT_SHOW(discard_granularity) QUEUE_SYSFS_LIMIT_SHOW(zone_write_granularity) QUEUE_SYSFS_LIMIT_SHOW(virt_boundary_mask) QUEUE_SYSFS_LIMIT_SHOW(dma_alignment) QUEUE_SYSFS_LIMIT_SHOW(max_open_zones) QUEUE_SYSFS_LIMIT_SHOW(max_active_zones) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%llu\n", \ (unsigned long long)disk->queue->limits._field << \ SECTOR_SHIFT); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return queue_var_show(disk->queue->limits._field >> 1, page); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%d\n", _val); \ } /* deprecated fields */ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) static int queue_max_discard_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_discard_bytes; ssize_t ret; ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; if (max_discard_bytes & (disk->queue->limits.discard_granularity - 1)) return -EINVAL; if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; return 0; } static int queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_sectors_kb; ssize_t ret; ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; lim->max_user_sectors = max_sectors_kb << 1; return 0; } static ssize_t queue_feature_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim, blk_features_t feature) { unsigned long val; ssize_t ret; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; if (val) lim->features |= feature; else lim->features &= ~feature; return 0; } #define QUEUE_SYSFS_FEATURE(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } \ static int queue_##_name##_store(struct gendisk *disk, \ const char *page, size_t count, struct queue_limits *lim) \ { \ return queue_feature_store(disk, page, count, lim, _feature); \ } QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); #define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); static ssize_t queue_poll_show(struct gendisk *disk, char *page) { if (queue_is_mq(disk->queue)) return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); return sysfs_emit(page, "%u\n", !!(disk->queue->limits.features & BLK_FEAT_POLL)); } static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { if (blk_queue_is_zoned(disk->queue)) return sysfs_emit(page, "host-managed\n"); return sysfs_emit(page, "none\n"); } static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) { return queue_var_show(disk_nr_zones(disk), page); } static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); } static int queue_iostats_passthrough_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long ios; ssize_t ret; ret = queue_var_store(&ios, page, count); if (ret < 0) return ret; if (ios) lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH; else lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; return 0; } static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | blk_queue_noxmerges(disk->queue), page); } static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; unsigned int memflags; struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; memflags = blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); blk_mq_unfreeze_queue(q, memflags); return ret; } static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page) { bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); return queue_var_show(set << force, page); } static ssize_t queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) { ssize_t ret = -EINVAL; #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; unsigned int memflags; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; /* * Here we update two queue flags each using atomic bitops, although * updating two flags isn't atomic it should be harmless as those flags * are accessed individually using atomic test_bit operation. So we * don't grab any lock while updating these flags. */ memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 1) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 0) { blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } blk_mq_unfreeze_queue(q, memflags); #endif return ret; } static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, size_t count) { return count; } static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { unsigned int memflags; ssize_t ret = count; struct request_queue *q = disk->queue; memflags = blk_mq_freeze_queue(q); if (!(q->limits.features & BLK_FEAT_POLL)) { ret = -EINVAL; goto out; } pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); out: blk_mq_unfreeze_queue(q, memflags); return ret; } static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { return sysfs_emit(page, "%u\n", jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout))); } static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { unsigned int val, memflags; int err; struct request_queue *q = disk->queue; err = kstrtou32(page, 10, &val); if (err || val == 0) return -EINVAL; memflags = blk_mq_freeze_queue(q); blk_queue_rq_timeout(q, msecs_to_jiffies(val)); blk_mq_unfreeze_queue(q, memflags); return count; } static ssize_t queue_wc_show(struct gendisk *disk, char *page) { if (blk_queue_write_cache(disk->queue)) return sysfs_emit(page, "write back\n"); return sysfs_emit(page, "write through\n"); } static int queue_wc_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { bool disable; if (!strncmp(page, "write back", 10)) { disable = false; } else if (!strncmp(page, "write through", 13) || !strncmp(page, "none", 4)) { disable = true; } else { return -EINVAL; } if (disable) lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED; else lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; return 0; } #define QUEUE_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ .show = _prefix##_show, \ }; #define QUEUE_RW_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show = _prefix##_show, \ .store = _prefix##_store, \ }; #define QUEUE_LIM_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ .show_limit = _prefix##_show, \ } #define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show_limit = _prefix##_show, \ .store_limit = _prefix##_store, \ } QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); QUEUE_RW_ENTRY(elv_iosched, "scheduler"); QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size"); QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size"); QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size"); QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity"); QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors, "atomic_write_boundary_bytes"); QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); QUEUE_LIM_RO_ENTRY(queue_fua, "fua"); QUEUE_LIM_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, .show_limit = queue_logical_block_size_show, }; QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats"); QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random"); QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes"); #ifdef CONFIG_BLK_WBT static ssize_t queue_var_store64(s64 *var, const char *page) { int err; s64 v; err = kstrtos64(page, 10, &v); if (err < 0) return err; *var = v; return 0; } static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { ssize_t ret; struct request_queue *q = disk->queue; mutex_lock(&q->elevator_lock); if (!wbt_rq_qos(q)) { ret = -EINVAL; goto out; } if (wbt_disabled(q)) { ret = sysfs_emit(page, "0\n"); goto out; } ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); out: mutex_unlock(&q->elevator_lock); return ret; } static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, size_t count) { struct request_queue *q = disk->queue; struct rq_qos *rqos; ssize_t ret; s64 val; unsigned int memflags; ret = queue_var_store64(&val, page); if (ret < 0) return ret; if (val < -1) return -EINVAL; memflags = blk_mq_freeze_queue(q); mutex_lock(&q->elevator_lock); rqos = wbt_rq_qos(q); if (!rqos) { ret = wbt_init(disk); if (ret) goto out; } ret = count; if (val == -1) val = wbt_default_latency_nsec(q); else if (val >= 0) val *= 1000ULL; if (wbt_get_min_lat(q) == val) goto out; /* * Ensure that the queue is idled, in case the latency update * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. */ blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); blk_mq_unquiesce_queue(q); out: mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); return ret; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif /* Common attributes for bio-based and request-based queues. */ static struct attribute *queue_attrs[] = { /* * Attributes which are protected with q->limits_lock. */ &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, &queue_chunk_sectors_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_max_discard_sectors_entry.attr, &queue_max_hw_discard_sectors_entry.attr, &queue_atomic_write_max_sectors_entry.attr, &queue_atomic_write_boundary_sectors_entry.attr, &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, &queue_iostats_passthrough_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_add_random_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, &queue_ra_entry.attr, /* * Attributes which don't require locking. */ &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_nr_zones_entry.attr, &queue_nomerges_entry.attr, &queue_poll_entry.attr, &queue_poll_delay_entry.attr, NULL, }; /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { /* * Attributes which require some form of locking other than * q->sysfs_lock. */ &elv_iosched_entry.attr, &queue_requests_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif /* * Attributes which don't require locking. */ &queue_rq_affinity_entry.attr, &queue_io_timeout_entry.attr, NULL, }; static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if ((attr == &queue_max_open_zones_entry.attr || attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) return 0; return attr->mode; } static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if (!queue_is_mq(q)) return 0; if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout) return 0; return attr->mode; } static struct attribute_group queue_attr_group = { .attrs = queue_attrs, .is_visible = queue_attr_visible, }; static struct attribute_group blk_mq_queue_attr_group = { .attrs = blk_mq_queue_attrs, .is_visible = blk_mq_queue_attr_visible, }; #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); if (!entry->show && !entry->show_limit) return -EIO; if (entry->show_limit) { ssize_t res; mutex_lock(&disk->queue->limits_lock); res = entry->show_limit(disk, page); mutex_unlock(&disk->queue->limits_lock); return res; } return entry->show(disk, page); } static ssize_t queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if (!entry->store_limit && !entry->store) return -EIO; if (entry->store_limit) { ssize_t res; struct queue_limits lim = queue_limits_start_update(q); res = entry->store_limit(disk, page, length, &lim); if (res < 0) { queue_limits_cancel_update(q); return res; } res = queue_limits_commit_update_frozen(q, &lim); if (res) return res; return length; } return entry->store(disk, page, length); } static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, }; static const struct attribute_group *blk_queue_attr_groups[] = { &queue_attr_group, &blk_mq_queue_attr_group, NULL }; static void blk_queue_release(struct kobject *kobj) { /* nothing to do here, all data is associated with the parent gendisk */ } static const struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, .release = blk_queue_release, }; static void blk_debugfs_remove(struct gendisk *disk) { struct request_queue *q = disk->queue; mutex_lock(&q->debugfs_mutex); blk_trace_shutdown(q); debugfs_remove_recursive(q->debugfs_dir); q->debugfs_dir = NULL; q->sched_debugfs_dir = NULL; q->rqos_debugfs_dir = NULL; mutex_unlock(&q->debugfs_mutex); } /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. */ int blk_register_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; int ret; kobject_init(&disk->queue_kobj, &blk_queue_ktype); ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) goto out_put_queue_kobj; if (queue_is_mq(q)) { ret = blk_mq_sysfs_register(disk); if (ret) goto out_put_queue_kobj; } mutex_lock(&q->sysfs_lock); mutex_lock(&q->debugfs_mutex); q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root); if (queue_is_mq(q)) blk_mq_debugfs_register(q); mutex_unlock(&q->debugfs_mutex); ret = disk_register_independent_access_ranges(disk); if (ret) goto out_debugfs_remove; ret = blk_crypto_sysfs_register(disk); if (ret) goto out_unregister_ia_ranges; mutex_lock(&q->elevator_lock); if (q->elevator) { ret = elv_register_queue(q, false); if (ret) { mutex_unlock(&q->elevator_lock); goto out_crypto_sysfs_unregister; } } wbt_enable_default(disk); mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); /* * SCSI probing may synchronously create and destroy a lot of * request_queues for non-existent devices. Shutting down a fully * functional queue takes measureable wallclock time as RCU grace * periods are involved. To avoid excessive latency in these * cases, a request_queue starts out in a degraded mode which is * faster to shut down and is made fully functional here as * request_queues for non-existent devices never get registered. */ blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); percpu_ref_switch_to_percpu(&q->q_usage_counter); return ret; out_crypto_sysfs_unregister: blk_crypto_sysfs_unregister(disk); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); out_put_queue_kobj: kobject_put(&disk->queue_kobj); return ret; } /** * blk_unregister_queue - counterpart of blk_register_queue() * @disk: Disk of which the request queue should be unregistered from sysfs. * * Note: the caller is responsible for guaranteeing that this function is called * after blk_register_queue() has finished. */ void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; if (WARN_ON(!q)) return; /* Return early if disk->queue was never registered. */ if (!blk_queue_registered(q)) return; /* * Since sysfs_remove_dir() prevents adding new directory entries * before removal of existing entries starts, protect against * concurrent elv_iosched_store() calls. */ mutex_lock(&q->sysfs_lock); blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); mutex_unlock(&q->sysfs_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); mutex_lock(&q->elevator_lock); elv_unregister_queue(q); mutex_unlock(&q->elevator_lock); mutex_lock(&q->sysfs_lock); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); /* Now that we've deleted all child objects, we can delete the queue. */ kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); kobject_del(&disk->queue_kobj); blk_debugfs_remove(disk); }
2 1 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 // SPDX-License-Identifier: GPL-2.0-or-later /* A network driver using virtio. * * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ //#define DEBUG #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_net.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/scatterlist.h> #include <linux/if_vlan.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/average.h> #include <linux/filter.h> #include <linux/kernel.h> #include <linux/dim.h> #include <net/route.h> #include <net/xdp.h> #include <net/net_failover.h> #include <net/netdev_rx_queue.h> #include <net/netdev_queues.h> #include <net/xdp_sock_drv.h> static int napi_weight = NAPI_POLL_WEIGHT; module_param(napi_weight, int, 0444); static bool csum = true, gso = true, napi_tx = true; module_param(csum, bool, 0444); module_param(gso, bool, 0444); module_param(napi_tx, bool, 0644); /* FIXME: MTU in config. */ #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 #define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* Separating two types of XDP xmit */ #define VIRTIO_XDP_TX BIT(0) #define VIRTIO_XDP_REDIR BIT(1) /* RX packet size EWMA. The average packet size is used to determine the packet * buffer size when refilling RX rings. As the entire RX ring may be refilled * at once, the weight is chosen so that the EWMA will be insensitive to short- * term, transient changes in packet size. */ DECLARE_EWMA(pkt_len, 0, 64) #define VIRTNET_DRIVER_VERSION "1.0.0" static const unsigned long guest_offloads[] = { VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, VIRTIO_NET_F_GUEST_HDRLEN }; #define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_NET_F_GUEST_UFO) | \ (1ULL << VIRTIO_NET_F_GUEST_USO4) | \ (1ULL << VIRTIO_NET_F_GUEST_USO6)) struct virtnet_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; size_t qstat_offset; }; struct virtnet_sq_free_stats { u64 packets; u64 bytes; u64 napi_packets; u64 napi_bytes; u64 xsk; }; struct virtnet_sq_stats { struct u64_stats_sync syncp; u64_stats_t packets; u64_stats_t bytes; u64_stats_t xdp_tx; u64_stats_t xdp_tx_drops; u64_stats_t kicks; u64_stats_t tx_timeouts; u64_stats_t stop; u64_stats_t wake; }; struct virtnet_rq_stats { struct u64_stats_sync syncp; u64_stats_t packets; u64_stats_t bytes; u64_stats_t drops; u64_stats_t xdp_packets; u64_stats_t xdp_tx; u64_stats_t xdp_redirects; u64_stats_t xdp_drops; u64_stats_t kicks; }; #define VIRTNET_SQ_STAT(name, m) {name, offsetof(struct virtnet_sq_stats, m), -1} #define VIRTNET_RQ_STAT(name, m) {name, offsetof(struct virtnet_rq_stats, m), -1} #define VIRTNET_SQ_STAT_QSTAT(name, m) \ { \ name, \ offsetof(struct virtnet_sq_stats, m), \ offsetof(struct netdev_queue_stats_tx, m), \ } #define VIRTNET_RQ_STAT_QSTAT(name, m) \ { \ name, \ offsetof(struct virtnet_rq_stats, m), \ offsetof(struct netdev_queue_stats_rx, m), \ } static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = { VIRTNET_SQ_STAT("xdp_tx", xdp_tx), VIRTNET_SQ_STAT("xdp_tx_drops", xdp_tx_drops), VIRTNET_SQ_STAT("kicks", kicks), VIRTNET_SQ_STAT("tx_timeouts", tx_timeouts), }; static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = { VIRTNET_RQ_STAT("drops", drops), VIRTNET_RQ_STAT("xdp_packets", xdp_packets), VIRTNET_RQ_STAT("xdp_tx", xdp_tx), VIRTNET_RQ_STAT("xdp_redirects", xdp_redirects), VIRTNET_RQ_STAT("xdp_drops", xdp_drops), VIRTNET_RQ_STAT("kicks", kicks), }; static const struct virtnet_stat_desc virtnet_sq_stats_desc_qstat[] = { VIRTNET_SQ_STAT_QSTAT("packets", packets), VIRTNET_SQ_STAT_QSTAT("bytes", bytes), VIRTNET_SQ_STAT_QSTAT("stop", stop), VIRTNET_SQ_STAT_QSTAT("wake", wake), }; static const struct virtnet_stat_desc virtnet_rq_stats_desc_qstat[] = { VIRTNET_RQ_STAT_QSTAT("packets", packets), VIRTNET_RQ_STAT_QSTAT("bytes", bytes), }; #define VIRTNET_STATS_DESC_CQ(name) \ {#name, offsetof(struct virtio_net_stats_cvq, name), -1} #define VIRTNET_STATS_DESC_RX(class, name) \ {#name, offsetof(struct virtio_net_stats_rx_ ## class, rx_ ## name), -1} #define VIRTNET_STATS_DESC_TX(class, name) \ {#name, offsetof(struct virtio_net_stats_tx_ ## class, tx_ ## name), -1} static const struct virtnet_stat_desc virtnet_stats_cvq_desc[] = { VIRTNET_STATS_DESC_CQ(command_num), VIRTNET_STATS_DESC_CQ(ok_num), }; static const struct virtnet_stat_desc virtnet_stats_rx_basic_desc[] = { VIRTNET_STATS_DESC_RX(basic, packets), VIRTNET_STATS_DESC_RX(basic, bytes), VIRTNET_STATS_DESC_RX(basic, notifications), VIRTNET_STATS_DESC_RX(basic, interrupts), }; static const struct virtnet_stat_desc virtnet_stats_tx_basic_desc[] = { VIRTNET_STATS_DESC_TX(basic, packets), VIRTNET_STATS_DESC_TX(basic, bytes), VIRTNET_STATS_DESC_TX(basic, notifications), VIRTNET_STATS_DESC_TX(basic, interrupts), }; static const struct virtnet_stat_desc virtnet_stats_rx_csum_desc[] = { VIRTNET_STATS_DESC_RX(csum, needs_csum), }; static const struct virtnet_stat_desc virtnet_stats_tx_gso_desc[] = { VIRTNET_STATS_DESC_TX(gso, gso_packets_noseg), VIRTNET_STATS_DESC_TX(gso, gso_bytes_noseg), }; static const struct virtnet_stat_desc virtnet_stats_rx_speed_desc[] = { VIRTNET_STATS_DESC_RX(speed, ratelimit_bytes), }; static const struct virtnet_stat_desc virtnet_stats_tx_speed_desc[] = { VIRTNET_STATS_DESC_TX(speed, ratelimit_bytes), }; #define VIRTNET_STATS_DESC_RX_QSTAT(class, name, qstat_field) \ { \ #name, \ offsetof(struct virtio_net_stats_rx_ ## class, rx_ ## name), \ offsetof(struct netdev_queue_stats_rx, qstat_field), \ } #define VIRTNET_STATS_DESC_TX_QSTAT(class, name, qstat_field) \ { \ #name, \ offsetof(struct virtio_net_stats_tx_ ## class, tx_ ## name), \ offsetof(struct netdev_queue_stats_tx, qstat_field), \ } static const struct virtnet_stat_desc virtnet_stats_rx_basic_desc_qstat[] = { VIRTNET_STATS_DESC_RX_QSTAT(basic, drops, hw_drops), VIRTNET_STATS_DESC_RX_QSTAT(basic, drop_overruns, hw_drop_overruns), }; static const struct virtnet_stat_desc virtnet_stats_tx_basic_desc_qstat[] = { VIRTNET_STATS_DESC_TX_QSTAT(basic, drops, hw_drops), VIRTNET_STATS_DESC_TX_QSTAT(basic, drop_malformed, hw_drop_errors), }; static const struct virtnet_stat_desc virtnet_stats_rx_csum_desc_qstat[] = { VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_valid, csum_unnecessary), VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_none, csum_none), VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_bad, csum_bad), }; static const struct virtnet_stat_desc virtnet_stats_tx_csum_desc_qstat[] = { VIRTNET_STATS_DESC_TX_QSTAT(csum, csum_none, csum_none), VIRTNET_STATS_DESC_TX_QSTAT(csum, needs_csum, needs_csum), }; static const struct virtnet_stat_desc virtnet_stats_rx_gso_desc_qstat[] = { VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_packets, hw_gro_packets), VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_bytes, hw_gro_bytes), VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_packets_coalesced, hw_gro_wire_packets), VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_bytes_coalesced, hw_gro_wire_bytes), }; static const struct virtnet_stat_desc virtnet_stats_tx_gso_desc_qstat[] = { VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_packets, hw_gso_packets), VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_bytes, hw_gso_bytes), VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_segments, hw_gso_wire_packets), VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_segments_bytes, hw_gso_wire_bytes), }; static const struct virtnet_stat_desc virtnet_stats_rx_speed_desc_qstat[] = { VIRTNET_STATS_DESC_RX_QSTAT(speed, ratelimit_packets, hw_drop_ratelimits), }; static const struct virtnet_stat_desc virtnet_stats_tx_speed_desc_qstat[] = { VIRTNET_STATS_DESC_TX_QSTAT(speed, ratelimit_packets, hw_drop_ratelimits), }; #define VIRTNET_Q_TYPE_RX 0 #define VIRTNET_Q_TYPE_TX 1 #define VIRTNET_Q_TYPE_CQ 2 struct virtnet_interrupt_coalesce { u32 max_packets; u32 max_usecs; }; /* The dma information of pages allocated at a time. */ struct virtnet_rq_dma { dma_addr_t addr; u32 ref; u16 len; u16 need_sync; }; /* Internal representation of a send virtqueue */ struct send_queue { /* Virtqueue associated with this send _queue */ struct virtqueue *vq; /* TX: fragments + linear part + virtio header */ struct scatterlist sg[MAX_SKB_FRAGS + 2]; /* Name of the send queue: output.$index */ char name[16]; struct virtnet_sq_stats stats; struct virtnet_interrupt_coalesce intr_coal; struct napi_struct napi; /* Record whether sq is in reset state. */ bool reset; struct xsk_buff_pool *xsk_pool; dma_addr_t xsk_hdr_dma_addr; }; /* Internal representation of a receive virtqueue */ struct receive_queue { /* Virtqueue associated with this receive_queue */ struct virtqueue *vq; struct napi_struct napi; struct bpf_prog __rcu *xdp_prog; struct virtnet_rq_stats stats; /* The number of rx notifications */ u16 calls; /* Is dynamic interrupt moderation enabled? */ bool dim_enabled; /* Used to protect dim_enabled and inter_coal */ struct mutex dim_lock; /* Dynamic Interrupt Moderation */ struct dim dim; u32 packets_in_napi; struct virtnet_interrupt_coalesce intr_coal; /* Chain pages by the private ptr. */ struct page *pages; /* Average packet length for mergeable receive buffers. */ struct ewma_pkt_len mrg_avg_pkt_len; /* Page frag for packet buffer allocation. */ struct page_frag alloc_frag; /* RX: fragments + linear part + virtio header */ struct scatterlist sg[MAX_SKB_FRAGS + 2]; /* Min single buffer size for mergeable buffers case. */ unsigned int min_buf_len; /* Name of this receive queue: input.$index */ char name[16]; struct xdp_rxq_info xdp_rxq; /* Record the last dma info to free after new pages is allocated. */ struct virtnet_rq_dma *last_dma; struct xsk_buff_pool *xsk_pool; /* xdp rxq used by xsk */ struct xdp_rxq_info xsk_rxq_info; struct xdp_buff **xsk_buffs; }; #define VIRTIO_NET_RSS_MAX_KEY_SIZE 40 /* Control VQ buffers: protected by the rtnl lock */ struct control_buf { struct virtio_net_ctrl_hdr hdr; virtio_net_ctrl_ack status; }; struct virtnet_info { struct virtio_device *vdev; struct virtqueue *cvq; struct net_device *dev; struct send_queue *sq; struct receive_queue *rq; unsigned int status; /* Max # of queue pairs supported by the device */ u16 max_queue_pairs; /* # of queue pairs currently used by the driver */ u16 curr_queue_pairs; /* # of XDP queue pairs currently used by the driver */ u16 xdp_queue_pairs; /* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */ bool xdp_enabled; /* I like... big packets and I cannot lie! */ bool big_packets; /* number of sg entries allocated for big packets */ unsigned int big_packets_num_skbfrags; /* Host will merge rx buffers for big packets (shake it! shake it!) */ bool mergeable_rx_bufs; /* Host supports rss and/or hash report */ bool has_rss; bool has_rss_hash_report; u8 rss_key_size; u16 rss_indir_table_size; u32 rss_hash_types_supported; u32 rss_hash_types_saved; struct virtio_net_rss_config_hdr *rss_hdr; struct virtio_net_rss_config_trailer rss_trailer; u8 rss_hash_key_data[VIRTIO_NET_RSS_MAX_KEY_SIZE]; /* Has control virtqueue */ bool has_cvq; /* Lock to protect the control VQ */ struct mutex cvq_lock; /* Host can handle any s/g split between our header and packet data */ bool any_header_sg; /* Packet virtio header size */ u8 hdr_len; /* Work struct for delayed refilling if we run low on memory. */ struct delayed_work refill; /* Is delayed refill enabled? */ bool refill_enabled; /* The lock to synchronize the access to refill_enabled */ spinlock_t refill_lock; /* Work struct for config space updates */ struct work_struct config_work; /* Work struct for setting rx mode */ struct work_struct rx_mode_work; /* OK to queue work setting RX mode? */ bool rx_mode_work_enabled; /* Does the affinity hint is set for virtqueues? */ bool affinity_hint_set; /* CPU hotplug instances for online & dead */ struct hlist_node node; struct hlist_node node_dead; struct control_buf *ctrl; /* Ethtool settings */ u8 duplex; u32 speed; /* Is rx dynamic interrupt moderation enabled? */ bool rx_dim_enabled; /* Interrupt coalescing settings */ struct virtnet_interrupt_coalesce intr_coal_tx; struct virtnet_interrupt_coalesce intr_coal_rx; unsigned long guest_offloads; unsigned long guest_offloads_capable; /* failover when STANDBY feature enabled */ struct failover *failover; u64 device_stats_cap; }; struct padded_vnet_hdr { struct virtio_net_hdr_v1_hash hdr; /* * hdr is in a separate sg buffer, and data sg buffer shares same page * with this header sg. This padding makes next sg 16 byte aligned * after the header. */ char padding[12]; }; struct virtio_net_common_hdr { union { struct virtio_net_hdr hdr; struct virtio_net_hdr_mrg_rxbuf mrg_hdr; struct virtio_net_hdr_v1_hash hash_v1_hdr; }; }; static struct virtio_net_common_hdr xsk_hdr; static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq); static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, struct net_device *dev, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats); static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, struct sk_buff *skb, u8 flags); static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, struct sk_buff *curr_skb, struct page *page, void *buf, int len, int truesize); static void virtnet_xsk_completed(struct send_queue *sq, int num); enum virtnet_xmit_type { VIRTNET_XMIT_TYPE_SKB, VIRTNET_XMIT_TYPE_SKB_ORPHAN, VIRTNET_XMIT_TYPE_XDP, VIRTNET_XMIT_TYPE_XSK, }; static size_t virtnet_rss_hdr_size(const struct virtnet_info *vi) { u16 indir_table_size = vi->has_rss ? vi->rss_indir_table_size : 1; return struct_size(vi->rss_hdr, indirection_table, indir_table_size); } static size_t virtnet_rss_trailer_size(const struct virtnet_info *vi) { return struct_size(&vi->rss_trailer, hash_key_data, vi->rss_key_size); } /* We use the last two bits of the pointer to distinguish the xmit type. */ #define VIRTNET_XMIT_TYPE_MASK (BIT(0) | BIT(1)) #define VIRTIO_XSK_FLAG_OFFSET 2 static enum virtnet_xmit_type virtnet_xmit_ptr_unpack(void **ptr) { unsigned long p = (unsigned long)*ptr; *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK); return p & VIRTNET_XMIT_TYPE_MASK; } static void *virtnet_xmit_ptr_pack(void *ptr, enum virtnet_xmit_type type) { return (void *)((unsigned long)ptr | type); } static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data, enum virtnet_xmit_type type) { return virtqueue_add_outbuf(sq->vq, sq->sg, num, virtnet_xmit_ptr_pack(data, type), GFP_ATOMIC); } static u32 virtnet_ptr_to_xsk_buff_len(void *ptr) { return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET; } static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) { sg_dma_address(sg) = addr; sg_dma_len(sg) = len; } static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, bool in_napi, struct virtnet_sq_free_stats *stats) { struct xdp_frame *frame; struct sk_buff *skb; unsigned int len; void *ptr; while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { switch (virtnet_xmit_ptr_unpack(&ptr)) { case VIRTNET_XMIT_TYPE_SKB: skb = ptr; pr_debug("Sent skb %p\n", skb); stats->napi_packets++; stats->napi_bytes += skb->len; napi_consume_skb(skb, in_napi); break; case VIRTNET_XMIT_TYPE_SKB_ORPHAN: skb = ptr; stats->packets++; stats->bytes += skb->len; napi_consume_skb(skb, in_napi); break; case VIRTNET_XMIT_TYPE_XDP: frame = ptr; stats->packets++; stats->bytes += xdp_get_frame_len(frame); xdp_return_frame(frame); break; case VIRTNET_XMIT_TYPE_XSK: stats->bytes += virtnet_ptr_to_xsk_buff_len(ptr); stats->xsk++; break; } } netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); } static void virtnet_free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, bool in_napi, struct virtnet_sq_free_stats *stats) { __free_old_xmit(sq, txq, in_napi, stats); if (stats->xsk) virtnet_xsk_completed(sq, stats->xsk); } /* Converting between virtqueue no. and kernel tx/rx queue no. * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq */ static int vq2txq(struct virtqueue *vq) { return (vq->index - 1) / 2; } static int txq2vq(int txq) { return txq * 2 + 1; } static int vq2rxq(struct virtqueue *vq) { return vq->index / 2; } static int rxq2vq(int rxq) { return rxq * 2; } static int vq_type(struct virtnet_info *vi, int qid) { if (qid == vi->max_queue_pairs * 2) return VIRTNET_Q_TYPE_CQ; if (qid % 2) return VIRTNET_Q_TYPE_TX; return VIRTNET_Q_TYPE_RX; } static inline struct virtio_net_common_hdr * skb_vnet_common_hdr(struct sk_buff *skb) { return (struct virtio_net_common_hdr *)skb->cb; } /* * private is used to chain pages for big packets, put the whole * most recent used list in the beginning for reuse */ static void give_pages(struct receive_queue *rq, struct page *page) { struct page *end; /* Find end of list, sew whole thing into vi->rq.pages. */ for (end = page; end->private; end = (struct page *)end->private); end->private = (unsigned long)rq->pages; rq->pages = page; } static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask) { struct page *p = rq->pages; if (p) { rq->pages = (struct page *)p->private; /* clear private here, it is used to chain pages */ p->private = 0; } else p = alloc_page(gfp_mask); return p; } static void virtnet_rq_free_buf(struct virtnet_info *vi, struct receive_queue *rq, void *buf) { if (vi->mergeable_rx_bufs) put_page(virt_to_head_page(buf)); else if (vi->big_packets) give_pages(rq, buf); else put_page(virt_to_head_page(buf)); } static void enable_delayed_refill(struct virtnet_info *vi) { spin_lock_bh(&vi->refill_lock); vi->refill_enabled = true; spin_unlock_bh(&vi->refill_lock); } static void disable_delayed_refill(struct virtnet_info *vi) { spin_lock_bh(&vi->refill_lock); vi->refill_enabled = false; spin_unlock_bh(&vi->refill_lock); } static void enable_rx_mode_work(struct virtnet_info *vi) { rtnl_lock(); vi->rx_mode_work_enabled = true; rtnl_unlock(); } static void disable_rx_mode_work(struct virtnet_info *vi) { rtnl_lock(); vi->rx_mode_work_enabled = false; rtnl_unlock(); } static void virtqueue_napi_schedule(struct napi_struct *napi, struct virtqueue *vq) { if (napi_schedule_prep(napi)) { virtqueue_disable_cb(vq); __napi_schedule(napi); } } static bool virtqueue_napi_complete(struct napi_struct *napi, struct virtqueue *vq, int processed) { int opaque; opaque = virtqueue_enable_cb_prepare(vq); if (napi_complete_done(napi, processed)) { if (unlikely(virtqueue_poll(vq, opaque))) virtqueue_napi_schedule(napi, vq); else return true; } else { virtqueue_disable_cb(vq); } return false; } static void skb_xmit_done(struct virtqueue *vq) { struct virtnet_info *vi = vq->vdev->priv; struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi; /* Suppress further interrupts. */ virtqueue_disable_cb(vq); if (napi->weight) virtqueue_napi_schedule(napi, vq); else /* We were probably waiting for more output buffers. */ netif_wake_subqueue(vi->dev, vq2txq(vq)); } #define MRG_CTX_HEADER_SHIFT 22 static void *mergeable_len_to_ctx(unsigned int truesize, unsigned int headroom) { return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize); } static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx) { return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT; } static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx) { return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1); } static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, unsigned int headroom, unsigned int len) { struct sk_buff *skb; skb = build_skb(buf, buflen); if (unlikely(!skb)) return NULL; skb_reserve(skb, headroom); skb_put(skb, len); return skb; } /* Called from bottom half context */ static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, struct page *page, unsigned int offset, unsigned int len, unsigned int truesize, unsigned int headroom) { struct sk_buff *skb; struct virtio_net_common_hdr *hdr; unsigned int copy, hdr_len, hdr_padded_len; struct page *page_to_free = NULL; int tailroom, shinfo_size; char *p, *hdr_p, *buf; p = page_address(page) + offset; hdr_p = p; hdr_len = vi->hdr_len; if (vi->mergeable_rx_bufs) hdr_padded_len = hdr_len; else hdr_padded_len = sizeof(struct padded_vnet_hdr); buf = p - headroom; len -= hdr_len; offset += hdr_padded_len; p += hdr_padded_len; tailroom = truesize - headroom - hdr_padded_len - len; shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) { skb = virtnet_build_skb(buf, truesize, p - buf, len); if (unlikely(!skb)) return NULL; page = (struct page *)page->private; if (page) give_pages(rq, page); goto ok; } /* copy small packet so we can reuse these pages for small data */ skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN); if (unlikely(!skb)) return NULL; /* Copy all frame if it fits skb->head, otherwise * we let virtio_net_hdr_to_skb() and GRO pull headers as needed. */ if (len <= skb_tailroom(skb)) copy = len; else copy = ETH_HLEN; skb_put_data(skb, p, copy); len -= copy; offset += copy; if (vi->mergeable_rx_bufs) { if (len) skb_add_rx_frag(skb, 0, page, offset, len, truesize); else page_to_free = page; goto ok; } /* * Verify that we can indeed put this data into a skb. * This is here to handle cases when the device erroneously * tries to receive more than is possible. This is usually * the case of a broken device. */ if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) { net_dbg_ratelimited("%s: too much data\n", skb->dev->name); dev_kfree_skb(skb); return NULL; } BUG_ON(offset >= PAGE_SIZE); while (len) { unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, frag_size, truesize); len -= frag_size; page = (struct page *)page->private; offset = 0; } if (page) give_pages(rq, page); ok: hdr = skb_vnet_common_hdr(skb); memcpy(hdr, hdr_p, hdr_len); if (page_to_free) put_page(page_to_free); return skb; } static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) { struct virtnet_info *vi = rq->vq->vdev->priv; struct page *page = virt_to_head_page(buf); struct virtnet_rq_dma *dma; void *head; int offset; BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); head = page_address(page); dma = head; --dma->ref; if (dma->need_sync && len) { offset = buf - (head + sizeof(*dma)); virtqueue_dma_sync_single_range_for_cpu(rq->vq, dma->addr, offset, len, DMA_FROM_DEVICE); } if (dma->ref) return; virtqueue_dma_unmap_single_attrs(rq->vq, dma->addr, dma->len, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); put_page(page); } static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx) { struct virtnet_info *vi = rq->vq->vdev->priv; void *buf; BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); buf = virtqueue_get_buf_ctx(rq->vq, len, ctx); if (buf) virtnet_rq_unmap(rq, buf, *len); return buf; } static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len) { struct virtnet_info *vi = rq->vq->vdev->priv; struct virtnet_rq_dma *dma; dma_addr_t addr; u32 offset; void *head; BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); head = page_address(rq->alloc_frag.page); offset = buf - head; dma = head; addr = dma->addr - sizeof(*dma) + offset; sg_init_table(rq->sg, 1); sg_fill_dma(rq->sg, addr, len); } static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) { struct page_frag *alloc_frag = &rq->alloc_frag; struct virtnet_info *vi = rq->vq->vdev->priv; struct virtnet_rq_dma *dma; void *buf, *head; dma_addr_t addr; BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); head = page_address(alloc_frag->page); dma = head; /* new pages */ if (!alloc_frag->offset) { if (rq->last_dma) { /* Now, the new page is allocated, the last dma * will not be used. So the dma can be unmapped * if the ref is 0. */ virtnet_rq_unmap(rq, rq->last_dma, 0); rq->last_dma = NULL; } dma->len = alloc_frag->size - sizeof(*dma); addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1, dma->len, DMA_FROM_DEVICE, 0); if (virtqueue_dma_mapping_error(rq->vq, addr)) return NULL; dma->addr = addr; dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr); /* Add a reference to dma to prevent the entire dma from * being released during error handling. This reference * will be freed after the pages are no longer used. */ get_page(alloc_frag->page); dma->ref = 1; alloc_frag->offset = sizeof(*dma); rq->last_dma = dma; } ++dma->ref; buf = head + alloc_frag->offset; get_page(alloc_frag->page); alloc_frag->offset += size; return buf; } static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) { struct virtnet_info *vi = vq->vdev->priv; struct receive_queue *rq; int i = vq2rxq(vq); rq = &vi->rq[i]; if (rq->xsk_pool) { xsk_buff_free((struct xdp_buff *)buf); return; } if (!vi->big_packets || vi->mergeable_rx_bufs) virtnet_rq_unmap(rq, buf, 0); virtnet_rq_free_buf(vi, rq, buf); } static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, bool in_napi) { struct virtnet_sq_free_stats stats = {0}; virtnet_free_old_xmit(sq, txq, in_napi, &stats); /* Avoid overhead when no packets have been processed * happens when called speculatively from start_xmit. */ if (!stats.packets && !stats.napi_packets) return; u64_stats_update_begin(&sq->stats.syncp); u64_stats_add(&sq->stats.bytes, stats.bytes + stats.napi_bytes); u64_stats_add(&sq->stats.packets, stats.packets + stats.napi_packets); u64_stats_update_end(&sq->stats.syncp); } static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) { if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs)) return false; else if (q < vi->curr_queue_pairs) return true; else return false; } static bool tx_may_stop(struct virtnet_info *vi, struct net_device *dev, struct send_queue *sq) { int qnum; qnum = sq - vi->sq; /* If running out of space, stop queue to avoid getting packets that we * are then unable to transmit. * An alternative would be to force queuing layer to requeue the skb by * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be * returned in a normal path of operation: it means that driver is not * maintaining the TX queue stop/start state properly, and causes * the stack to do a non-trivial amount of useless work. * Since most packets only take 1 or 2 ring slots, stopping the queue * early means 16 slots are typically wasted. */ if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); netif_tx_stop_queue(txq); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.stop); u64_stats_update_end(&sq->stats.syncp); return true; } return false; } static void check_sq_full_and_disable(struct virtnet_info *vi, struct net_device *dev, struct send_queue *sq) { bool use_napi = sq->napi.weight; int qnum; qnum = sq - vi->sq; if (tx_may_stop(vi, dev, sq)) { struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); if (use_napi) { if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) virtqueue_napi_schedule(&sq->napi, sq->vq); } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { /* More just got used, free them then recheck. */ free_old_xmit(sq, txq, false); if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { netif_start_subqueue(dev, qnum); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.wake); u64_stats_update_end(&sq->stats.syncp); virtqueue_disable_cb(sq->vq); } } } } static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, struct receive_queue *rq, void *buf, u32 len) { struct xdp_buff *xdp; u32 bufsize; xdp = (struct xdp_buff *)buf; bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; if (unlikely(len > bufsize)) { pr_debug("%s: rx error: len %u exceeds truesize %u\n", vi->dev->name, len, bufsize); DEV_STATS_INC(vi->dev, rx_length_errors); xsk_buff_free(xdp); return NULL; } xsk_buff_set_size(xdp, len); xsk_buff_dma_sync_for_cpu(xdp); return xdp; } static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, struct xdp_buff *xdp) { unsigned int metasize = xdp->data - xdp->data_meta; struct sk_buff *skb; unsigned int size; size = xdp->data_end - xdp->data_hard_start; skb = napi_alloc_skb(&rq->napi, size); if (unlikely(!skb)) { xsk_buff_free(xdp); return NULL; } skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); size = xdp->data_end - xdp->data_meta; memcpy(__skb_put(skb, size), xdp->data_meta, size); if (metasize) { __skb_pull(skb, metasize); skb_metadata_set(skb, metasize); } xsk_buff_free(xdp); return skb; } static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct xdp_buff *xdp, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct bpf_prog *prog; u32 ret; ret = XDP_PASS; rcu_read_lock(); prog = rcu_dereference(rq->xdp_prog); if (prog) ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); rcu_read_unlock(); switch (ret) { case XDP_PASS: return xsk_construct_skb(rq, xdp); case XDP_TX: case XDP_REDIRECT: return NULL; default: /* drop packet */ xsk_buff_free(xdp); u64_stats_inc(&stats->drops); return NULL; } } static void xsk_drop_follow_bufs(struct net_device *dev, struct receive_queue *rq, u32 num_buf, struct virtnet_rq_stats *stats) { struct xdp_buff *xdp; u32 len; while (num_buf-- > 1) { xdp = virtqueue_get_buf(rq->vq, &len); if (unlikely(!xdp)) { pr_debug("%s: rx error: %d buffers missing\n", dev->name, num_buf); DEV_STATS_INC(dev, rx_length_errors); break; } u64_stats_add(&stats->bytes, len); xsk_buff_free(xdp); } } static int xsk_append_merge_buffer(struct virtnet_info *vi, struct receive_queue *rq, struct sk_buff *head_skb, u32 num_buf, struct virtio_net_hdr_mrg_rxbuf *hdr, struct virtnet_rq_stats *stats) { struct sk_buff *curr_skb; struct xdp_buff *xdp; u32 len, truesize; struct page *page; void *buf; curr_skb = head_skb; while (--num_buf) { buf = virtqueue_get_buf(rq->vq, &len); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers out of %d missing\n", vi->dev->name, num_buf, virtio16_to_cpu(vi->vdev, hdr->num_buffers)); DEV_STATS_INC(vi->dev, rx_length_errors); return -EINVAL; } u64_stats_add(&stats->bytes, len); xdp = buf_to_xdp(vi, rq, buf, len); if (!xdp) goto err; buf = napi_alloc_frag(len); if (!buf) { xsk_buff_free(xdp); goto err; } memcpy(buf, xdp->data - vi->hdr_len, len); xsk_buff_free(xdp); page = virt_to_page(buf); truesize = len; curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, buf, len, truesize); if (!curr_skb) { put_page(page); goto err; } } return 0; err: xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats); return -EINVAL; } static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct xdp_buff *xdp, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr; struct bpf_prog *prog; struct sk_buff *skb; u32 ret, num_buf; hdr = xdp->data - vi->hdr_len; num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); ret = XDP_PASS; rcu_read_lock(); prog = rcu_dereference(rq->xdp_prog); /* TODO: support multi buffer. */ if (prog && num_buf == 1) ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); rcu_read_unlock(); switch (ret) { case XDP_PASS: skb = xsk_construct_skb(rq, xdp); if (!skb) goto drop_bufs; if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) { dev_kfree_skb(skb); goto drop; } return skb; case XDP_TX: case XDP_REDIRECT: return NULL; default: /* drop packet */ xsk_buff_free(xdp); } drop_bufs: xsk_drop_follow_bufs(dev, rq, num_buf, stats); drop: u64_stats_inc(&stats->drops); return NULL; } static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, void *buf, u32 len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct net_device *dev = vi->dev; struct sk_buff *skb = NULL; struct xdp_buff *xdp; u8 flags; len -= vi->hdr_len; u64_stats_add(&stats->bytes, len); xdp = buf_to_xdp(vi, rq, buf, len); if (!xdp) return; if (unlikely(len < ETH_HLEN)) { pr_debug("%s: short packet %i\n", dev->name, len); DEV_STATS_INC(dev, rx_length_errors); xsk_buff_free(xdp); return; } flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; if (!vi->mergeable_rx_bufs) skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); else skb = virtnet_receive_xsk_merge(dev, vi, rq, xdp, xdp_xmit, stats); if (skb) virtnet_receive_done(vi, rq, skb, flags); } static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq, struct xsk_buff_pool *pool, gfp_t gfp) { struct xdp_buff **xsk_buffs; dma_addr_t addr; int err = 0; u32 len, i; int num; xsk_buffs = rq->xsk_buffs; num = xsk_buff_alloc_batch(pool, xsk_buffs, rq->vq->num_free); if (!num) return -ENOMEM; len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len; for (i = 0; i < num; ++i) { /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space. * We assume XDP_PACKET_HEADROOM is larger than hdr->len. * (see function virtnet_xsk_pool_enable) */ addr = xsk_buff_xdp_get_dma(xsk_buffs[i]) - vi->hdr_len; sg_init_table(rq->sg, 1); sg_fill_dma(rq->sg, addr, len); err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, xsk_buffs[i], NULL, gfp); if (err) goto err; } return num; err: for (; i < num; ++i) xsk_buff_free(xsk_buffs[i]); return err; } static void *virtnet_xsk_to_ptr(u32 len) { unsigned long p; p = len << VIRTIO_XSK_FLAG_OFFSET; return virtnet_xmit_ptr_pack((void *)p, VIRTNET_XMIT_TYPE_XSK); } static int virtnet_xsk_xmit_one(struct send_queue *sq, struct xsk_buff_pool *pool, struct xdp_desc *desc) { struct virtnet_info *vi; dma_addr_t addr; vi = sq->vq->vdev->priv; addr = xsk_buff_raw_get_dma(pool, desc->addr); xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len); sg_init_table(sq->sg, 2); sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len); sg_fill_dma(sq->sg + 1, addr, desc->len); return virtqueue_add_outbuf_premapped(sq->vq, sq->sg, 2, virtnet_xsk_to_ptr(desc->len), GFP_ATOMIC); } static int virtnet_xsk_xmit_batch(struct send_queue *sq, struct xsk_buff_pool *pool, unsigned int budget, u64 *kicks) { struct xdp_desc *descs = pool->tx_descs; bool kick = false; u32 nb_pkts, i; int err; budget = min_t(u32, budget, sq->vq->num_free); nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); if (!nb_pkts) return 0; for (i = 0; i < nb_pkts; i++) { err = virtnet_xsk_xmit_one(sq, pool, &descs[i]); if (unlikely(err)) { xsk_tx_completed(sq->xsk_pool, nb_pkts - i); break; } kick = true; } if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) (*kicks)++; return i; } static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool, int budget) { struct virtnet_info *vi = sq->vq->vdev->priv; struct virtnet_sq_free_stats stats = {}; struct net_device *dev = vi->dev; u64 kicks = 0; int sent; /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of * free_old_xmit(). */ __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats); if (stats.xsk) xsk_tx_completed(sq->xsk_pool, stats.xsk); sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks); if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq)) check_sq_full_and_disable(vi, vi->dev, sq); if (sent) { struct netdev_queue *txq; txq = netdev_get_tx_queue(vi->dev, sq - vi->sq); txq_trans_cond_update(txq); } u64_stats_update_begin(&sq->stats.syncp); u64_stats_add(&sq->stats.packets, stats.packets); u64_stats_add(&sq->stats.bytes, stats.bytes); u64_stats_add(&sq->stats.kicks, kicks); u64_stats_add(&sq->stats.xdp_tx, sent); u64_stats_update_end(&sq->stats.syncp); if (xsk_uses_need_wakeup(pool)) xsk_set_tx_need_wakeup(pool); return sent; } static void xsk_wakeup(struct send_queue *sq) { if (napi_if_scheduled_mark_missed(&sq->napi)) return; local_bh_disable(); virtqueue_napi_schedule(&sq->napi, sq->vq); local_bh_enable(); } static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag) { struct virtnet_info *vi = netdev_priv(dev); struct send_queue *sq; if (!netif_running(dev)) return -ENETDOWN; if (qid >= vi->curr_queue_pairs) return -EINVAL; sq = &vi->sq[qid]; xsk_wakeup(sq); return 0; } static void virtnet_xsk_completed(struct send_queue *sq, int num) { xsk_tx_completed(sq->xsk_pool, num); /* If this is called by rx poll, start_xmit and xdp xmit we should * wakeup the tx napi to consume the xsk tx queue, because the tx * interrupt may not be triggered. */ xsk_wakeup(sq); } static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, struct send_queue *sq, struct xdp_frame *xdpf) { struct virtio_net_hdr_mrg_rxbuf *hdr; struct skb_shared_info *shinfo; u8 nr_frags = 0; int err, i; if (unlikely(xdpf->headroom < vi->hdr_len)) return -EOVERFLOW; if (unlikely(xdp_frame_has_frags(xdpf))) { shinfo = xdp_get_shared_info_from_frame(xdpf); nr_frags = shinfo->nr_frags; } /* In wrapping function virtnet_xdp_xmit(), we need to free * up the pending old buffers, where we need to calculate the * position of skb_shared_info in xdp_get_frame_len() and * xdp_return_frame(), which will involve to xdpf->data and * xdpf->headroom. Therefore, we need to update the value of * headroom synchronously here. */ xdpf->headroom -= vi->hdr_len; xdpf->data -= vi->hdr_len; /* Zero header and leave csum up to XDP layers */ hdr = xdpf->data; memset(hdr, 0, vi->hdr_len); xdpf->len += vi->hdr_len; sg_init_table(sq->sg, nr_frags + 1); sg_set_buf(sq->sg, xdpf->data, xdpf->len); for (i = 0; i < nr_frags; i++) { skb_frag_t *frag = &shinfo->frags[i]; sg_set_page(&sq->sg[i + 1], skb_frag_page(frag), skb_frag_size(frag), skb_frag_off(frag)); } err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP); if (unlikely(err)) return -ENOSPC; /* Caller handle free/refcnt */ return 0; } /* when vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on * the current cpu, so it does not need to be locked. * * Here we use marco instead of inline functions because we have to deal with * three issues at the same time: 1. the choice of sq. 2. judge and execute the * lock/unlock of txq 3. make sparse happy. It is difficult for two inline * functions to perfectly solve these three problems at the same time. */ #define virtnet_xdp_get_sq(vi) ({ \ int cpu = smp_processor_id(); \ struct netdev_queue *txq; \ typeof(vi) v = (vi); \ unsigned int qp; \ \ if (v->curr_queue_pairs > nr_cpu_ids) { \ qp = v->curr_queue_pairs - v->xdp_queue_pairs; \ qp += cpu; \ txq = netdev_get_tx_queue(v->dev, qp); \ __netif_tx_acquire(txq); \ } else { \ qp = cpu % v->curr_queue_pairs; \ txq = netdev_get_tx_queue(v->dev, qp); \ __netif_tx_lock(txq, cpu); \ } \ v->sq + qp; \ }) #define virtnet_xdp_put_sq(vi, q) { \ struct netdev_queue *txq; \ typeof(vi) v = (vi); \ \ txq = netdev_get_tx_queue(v->dev, (q) - v->sq); \ if (v->curr_queue_pairs > nr_cpu_ids) \ __netif_tx_release(txq); \ else \ __netif_tx_unlock(txq); \ } static int virtnet_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct virtnet_info *vi = netdev_priv(dev); struct virtnet_sq_free_stats stats = {0}; struct receive_queue *rq = vi->rq; struct bpf_prog *xdp_prog; struct send_queue *sq; int nxmit = 0; int kicks = 0; int ret; int i; /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this * indicate XDP resources have been successfully allocated. */ xdp_prog = rcu_access_pointer(rq->xdp_prog); if (!xdp_prog) return -ENXIO; sq = virtnet_xdp_get_sq(vi); if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) { ret = -EINVAL; goto out; } /* Free up any pending old buffers before queueing new ones. */ virtnet_free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), false, &stats); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; if (__virtnet_xdp_xmit_one(vi, sq, xdpf)) break; nxmit++; } ret = nxmit; if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq)) check_sq_full_and_disable(vi, dev, sq); if (flags & XDP_XMIT_FLUSH) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) kicks = 1; } out: u64_stats_update_begin(&sq->stats.syncp); u64_stats_add(&sq->stats.bytes, stats.bytes); u64_stats_add(&sq->stats.packets, stats.packets); u64_stats_add(&sq->stats.xdp_tx, n); u64_stats_add(&sq->stats.xdp_tx_drops, n - nxmit); u64_stats_add(&sq->stats.kicks, kicks); u64_stats_update_end(&sq->stats.syncp); virtnet_xdp_put_sq(vi, sq); return ret; } static void put_xdp_frags(struct xdp_buff *xdp) { struct skb_shared_info *shinfo; struct page *xdp_page; int i; if (xdp_buff_has_frags(xdp)) { shinfo = xdp_get_shared_info_from_buff(xdp); for (i = 0; i < shinfo->nr_frags; i++) { xdp_page = skb_frag_page(&shinfo->frags[i]); put_page(xdp_page); } } } static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, struct net_device *dev, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct xdp_frame *xdpf; int err; u32 act; act = bpf_prog_run_xdp(xdp_prog, xdp); u64_stats_inc(&stats->xdp_packets); switch (act) { case XDP_PASS: return act; case XDP_TX: u64_stats_inc(&stats->xdp_tx); xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) { netdev_dbg(dev, "convert buff to frame failed for xdp\n"); return XDP_DROP; } err = virtnet_xdp_xmit(dev, 1, &xdpf, 0); if (unlikely(!err)) { xdp_return_frame_rx_napi(xdpf); } else if (unlikely(err < 0)) { trace_xdp_exception(dev, xdp_prog, act); return XDP_DROP; } *xdp_xmit |= VIRTIO_XDP_TX; return act; case XDP_REDIRECT: u64_stats_inc(&stats->xdp_redirects); err = xdp_do_redirect(dev, xdp, xdp_prog); if (err) return XDP_DROP; *xdp_xmit |= VIRTIO_XDP_REDIR; return act; default: bpf_warn_invalid_xdp_action(dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dev, xdp_prog, act); fallthrough; case XDP_DROP: return XDP_DROP; } } static unsigned int virtnet_get_headroom(struct virtnet_info *vi) { return vi->xdp_enabled ? XDP_PACKET_HEADROOM : 0; } /* We copy the packet for XDP in the following cases: * * 1) Packet is scattered across multiple rx buffers. * 2) Headroom space is insufficient. * * This is inefficient but it's a temporary condition that * we hit right after XDP is enabled and until queue is refilled * with large buffers with sufficient headroom - so it should affect * at most queue size packets. * Afterwards, the conditions to enable * XDP should preclude the underlying device from sending packets * across multiple buffers (num_buf > 1), and we make sure buffers * have enough headroom. */ static struct page *xdp_linearize_page(struct receive_queue *rq, int *num_buf, struct page *p, int offset, int page_off, unsigned int *len) { int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); struct page *page; if (page_off + *len + tailroom > PAGE_SIZE) return NULL; page = alloc_page(GFP_ATOMIC); if (!page) return NULL; memcpy(page_address(page) + page_off, page_address(p) + offset, *len); page_off += *len; while (--*num_buf) { unsigned int buflen; void *buf; int off; buf = virtnet_rq_get_buf(rq, &buflen, NULL); if (unlikely(!buf)) goto err_buf; p = virt_to_head_page(buf); off = buf - page_address(p); /* guard against a misconfigured or uncooperative backend that * is sending packet larger than the MTU. */ if ((page_off + buflen + tailroom) > PAGE_SIZE) { put_page(p); goto err_buf; } memcpy(page_address(page) + page_off, page_address(p) + off, buflen); page_off += buflen; put_page(p); } /* Headroom does not contribute to packet length */ *len = page_off - XDP_PACKET_HEADROOM; return page; err_buf: __free_pages(page, 0); return NULL; } static struct sk_buff *receive_small_build_skb(struct virtnet_info *vi, unsigned int xdp_headroom, void *buf, unsigned int len) { unsigned int header_offset; unsigned int headroom; unsigned int buflen; struct sk_buff *skb; header_offset = VIRTNET_RX_PAD + xdp_headroom; headroom = vi->hdr_len + header_offset; buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); skb = virtnet_build_skb(buf, buflen, headroom, len); if (unlikely(!skb)) return NULL; buf += header_offset; memcpy(skb_vnet_common_hdr(skb), buf, vi->hdr_len); return skb; } static struct sk_buff *receive_small_xdp(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct bpf_prog *xdp_prog, void *buf, unsigned int xdp_headroom, unsigned int len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom; unsigned int headroom = vi->hdr_len + header_offset; struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset; struct page *page = virt_to_head_page(buf); struct page *xdp_page; unsigned int buflen; struct xdp_buff xdp; struct sk_buff *skb; unsigned int metasize = 0; u32 act; if (unlikely(hdr->hdr.gso_type)) goto err_xdp; /* Partially checksummed packets must be dropped. */ if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) goto err_xdp; buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) { int offset = buf - page_address(page) + header_offset; unsigned int tlen = len + vi->hdr_len; int num_buf = 1; xdp_headroom = virtnet_get_headroom(vi); header_offset = VIRTNET_RX_PAD + xdp_headroom; headroom = vi->hdr_len + header_offset; buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); xdp_page = xdp_linearize_page(rq, &num_buf, page, offset, header_offset, &tlen); if (!xdp_page) goto err_xdp; buf = page_address(xdp_page); put_page(page); page = xdp_page; } xdp_init_buff(&xdp, buflen, &rq->xdp_rxq); xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len, xdp_headroom, len, true); act = virtnet_xdp_handler(xdp_prog, &xdp, dev, xdp_xmit, stats); switch (act) { case XDP_PASS: /* Recalculate length in case bpf program changed it */ len = xdp.data_end - xdp.data; metasize = xdp.data - xdp.data_meta; break; case XDP_TX: case XDP_REDIRECT: goto xdp_xmit; default: goto err_xdp; } skb = virtnet_build_skb(buf, buflen, xdp.data - buf, len); if (unlikely(!skb)) goto err; if (metasize) skb_metadata_set(skb, metasize); return skb; err_xdp: u64_stats_inc(&stats->xdp_drops); err: u64_stats_inc(&stats->drops); put_page(page); xdp_xmit: return NULL; } static struct sk_buff *receive_small(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, void *buf, void *ctx, unsigned int len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { unsigned int xdp_headroom = (unsigned long)ctx; struct page *page = virt_to_head_page(buf); struct sk_buff *skb; /* We passed the address of virtnet header to virtio-core, * so truncate the padding. */ buf -= VIRTNET_RX_PAD + xdp_headroom; len -= vi->hdr_len; u64_stats_add(&stats->bytes, len); if (unlikely(len > GOOD_PACKET_LEN)) { pr_debug("%s: rx error: len %u exceeds max size %d\n", dev->name, len, GOOD_PACKET_LEN); DEV_STATS_INC(dev, rx_length_errors); goto err; } if (unlikely(vi->xdp_enabled)) { struct bpf_prog *xdp_prog; rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { skb = receive_small_xdp(dev, vi, rq, xdp_prog, buf, xdp_headroom, len, xdp_xmit, stats); rcu_read_unlock(); return skb; } rcu_read_unlock(); } skb = receive_small_build_skb(vi, xdp_headroom, buf, len); if (likely(skb)) return skb; err: u64_stats_inc(&stats->drops); put_page(page); return NULL; } static struct sk_buff *receive_big(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, void *buf, unsigned int len, struct virtnet_rq_stats *stats) { struct page *page = buf; struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0); u64_stats_add(&stats->bytes, len - vi->hdr_len); if (unlikely(!skb)) goto err; return skb; err: u64_stats_inc(&stats->drops); give_pages(rq, page); return NULL; } static void mergeable_buf_free(struct receive_queue *rq, int num_buf, struct net_device *dev, struct virtnet_rq_stats *stats) { struct page *page; void *buf; int len; while (num_buf-- > 1) { buf = virtnet_rq_get_buf(rq, &len, NULL); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers missing\n", dev->name, num_buf); DEV_STATS_INC(dev, rx_length_errors); break; } u64_stats_add(&stats->bytes, len); page = virt_to_head_page(buf); put_page(page); } } /* Why not use xdp_build_skb_from_frame() ? * XDP core assumes that xdp frags are PAGE_SIZE in length, while in * virtio-net there are 2 points that do not match its requirements: * 1. The size of the prefilled buffer is not fixed before xdp is set. * 2. xdp_build_skb_from_frame() does more checks that we don't need, * like eth_type_trans() (which virtio-net does in receive_buf()). */ static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev, struct virtnet_info *vi, struct xdp_buff *xdp, unsigned int xdp_frags_truesz) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); unsigned int headroom, data_len; struct sk_buff *skb; int metasize; u8 nr_frags; if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { pr_debug("Error building skb as missing reserved tailroom for xdp"); return NULL; } if (unlikely(xdp_buff_has_frags(xdp))) nr_frags = sinfo->nr_frags; skb = build_skb(xdp->data_hard_start, xdp->frame_sz); if (unlikely(!skb)) return NULL; headroom = xdp->data - xdp->data_hard_start; data_len = xdp->data_end - xdp->data; skb_reserve(skb, headroom); __skb_put(skb, data_len); metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? metasize : 0; if (metasize) skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) xdp_update_skb_shared_info(skb, nr_frags, sinfo->xdp_frags_size, xdp_frags_truesz, xdp_buff_is_frag_pfmemalloc(xdp)); return skb; } /* TODO: build xdp in big mode */ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct xdp_buff *xdp, void *buf, unsigned int len, unsigned int frame_sz, int *num_buf, unsigned int *xdp_frags_truesize, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; unsigned int headroom, tailroom, room; unsigned int truesize, cur_frag_size; struct skb_shared_info *shinfo; unsigned int xdp_frags_truesz = 0; struct page *page; skb_frag_t *frag; int offset; void *ctx; xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); xdp_prepare_buff(xdp, buf - XDP_PACKET_HEADROOM, XDP_PACKET_HEADROOM + vi->hdr_len, len - vi->hdr_len, true); if (!*num_buf) return 0; if (*num_buf > 1) { /* If we want to build multi-buffer xdp, we need * to specify that the flags of xdp_buff have the * XDP_FLAGS_HAS_FRAG bit. */ if (!xdp_buff_has_frags(xdp)) xdp_buff_set_frags_flag(xdp); shinfo = xdp_get_shared_info_from_buff(xdp); shinfo->nr_frags = 0; shinfo->xdp_frags_size = 0; } if (*num_buf > MAX_SKB_FRAGS + 1) return -EINVAL; while (--*num_buf > 0) { buf = virtnet_rq_get_buf(rq, &len, &ctx); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers out of %d missing\n", dev->name, *num_buf, virtio16_to_cpu(vi->vdev, hdr->num_buffers)); DEV_STATS_INC(dev, rx_length_errors); goto err; } u64_stats_add(&stats->bytes, len); page = virt_to_head_page(buf); offset = buf - page_address(page); truesize = mergeable_ctx_to_truesize(ctx); headroom = mergeable_ctx_to_headroom(ctx); tailroom = headroom ? sizeof(struct skb_shared_info) : 0; room = SKB_DATA_ALIGN(headroom + tailroom); cur_frag_size = truesize; xdp_frags_truesz += cur_frag_size; if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { put_page(page); pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)(truesize - room)); DEV_STATS_INC(dev, rx_length_errors); goto err; } frag = &shinfo->frags[shinfo->nr_frags++]; skb_frag_fill_page_desc(frag, page, offset, len); if (page_is_pfmemalloc(page)) xdp_buff_set_frag_pfmemalloc(xdp); shinfo->xdp_frags_size += len; } *xdp_frags_truesize = xdp_frags_truesz; return 0; err: put_xdp_frags(xdp); return -EINVAL; } static void *mergeable_xdp_get_buf(struct virtnet_info *vi, struct receive_queue *rq, struct bpf_prog *xdp_prog, void *ctx, unsigned int *frame_sz, int *num_buf, struct page **page, int offset, unsigned int *len, struct virtio_net_hdr_mrg_rxbuf *hdr) { unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); struct page *xdp_page; unsigned int xdp_room; /* Transient failure which in theory could occur if * in-flight packets from before XDP was enabled reach * the receive path after XDP is loaded. */ if (unlikely(hdr->hdr.gso_type)) return NULL; /* Partially checksummed packets must be dropped. */ if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return NULL; /* Now XDP core assumes frag size is PAGE_SIZE, but buffers * with headroom may add hole in truesize, which * make their length exceed PAGE_SIZE. So we disabled the * hole mechanism for xdp. See add_recvbuf_mergeable(). */ *frame_sz = truesize; if (likely(headroom >= virtnet_get_headroom(vi) && (*num_buf == 1 || xdp_prog->aux->xdp_has_frags))) { return page_address(*page) + offset; } /* This happens when headroom is not enough because * of the buffer was prefilled before XDP is set. * This should only happen for the first several packets. * In fact, vq reset can be used here to help us clean up * the prefilled buffers, but many existing devices do not * support it, and we don't want to bother users who are * using xdp normally. */ if (!xdp_prog->aux->xdp_has_frags) { /* linearize data for XDP */ xdp_page = xdp_linearize_page(rq, num_buf, *page, offset, XDP_PACKET_HEADROOM, len); if (!xdp_page) return NULL; } else { xdp_room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + sizeof(struct skb_shared_info)); if (*len + xdp_room > PAGE_SIZE) return NULL; xdp_page = alloc_page(GFP_ATOMIC); if (!xdp_page) return NULL; memcpy(page_address(xdp_page) + XDP_PACKET_HEADROOM, page_address(*page) + offset, *len); } *frame_sz = PAGE_SIZE; put_page(*page); *page = xdp_page; return page_address(*page) + XDP_PACKET_HEADROOM; } static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct bpf_prog *xdp_prog, void *buf, void *ctx, unsigned int len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; int num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); struct page *page = virt_to_head_page(buf); int offset = buf - page_address(page); unsigned int xdp_frags_truesz = 0; struct sk_buff *head_skb; unsigned int frame_sz; struct xdp_buff xdp; void *data; u32 act; int err; data = mergeable_xdp_get_buf(vi, rq, xdp_prog, ctx, &frame_sz, &num_buf, &page, offset, &len, hdr); if (unlikely(!data)) goto err_xdp; err = virtnet_build_xdp_buff_mrg(dev, vi, rq, &xdp, data, len, frame_sz, &num_buf, &xdp_frags_truesz, stats); if (unlikely(err)) goto err_xdp; act = virtnet_xdp_handler(xdp_prog, &xdp, dev, xdp_xmit, stats); switch (act) { case XDP_PASS: head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); if (unlikely(!head_skb)) break; return head_skb; case XDP_TX: case XDP_REDIRECT: return NULL; default: break; } put_xdp_frags(&xdp); err_xdp: put_page(page); mergeable_buf_free(rq, num_buf, dev, stats); u64_stats_inc(&stats->xdp_drops); u64_stats_inc(&stats->drops); return NULL; } static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, struct sk_buff *curr_skb, struct page *page, void *buf, int len, int truesize) { int num_skb_frags; int offset; num_skb_frags = skb_shinfo(curr_skb)->nr_frags; if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); if (unlikely(!nskb)) return NULL; if (curr_skb == head_skb) skb_shinfo(curr_skb)->frag_list = nskb; else curr_skb->next = nskb; curr_skb = nskb; head_skb->truesize += nskb->truesize; num_skb_frags = 0; } if (curr_skb != head_skb) { head_skb->data_len += len; head_skb->len += len; head_skb->truesize += truesize; } offset = buf - page_address(page); if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { put_page(page); skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, len, truesize); } else { skb_add_rx_frag(curr_skb, num_skb_frags, page, offset, len, truesize); } return curr_skb; } static struct sk_buff *receive_mergeable(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, void *buf, void *ctx, unsigned int len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; int num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); struct page *page = virt_to_head_page(buf); int offset = buf - page_address(page); struct sk_buff *head_skb, *curr_skb; unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0; unsigned int room = SKB_DATA_ALIGN(headroom + tailroom); head_skb = NULL; u64_stats_add(&stats->bytes, len - vi->hdr_len); if (unlikely(len > truesize - room)) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)(truesize - room)); DEV_STATS_INC(dev, rx_length_errors); goto err_skb; } if (unlikely(vi->xdp_enabled)) { struct bpf_prog *xdp_prog; rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { head_skb = receive_mergeable_xdp(dev, vi, rq, xdp_prog, buf, ctx, len, xdp_xmit, stats); rcu_read_unlock(); return head_skb; } rcu_read_unlock(); } head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); curr_skb = head_skb; if (unlikely(!curr_skb)) goto err_skb; while (--num_buf) { buf = virtnet_rq_get_buf(rq, &len, &ctx); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers out of %d missing\n", dev->name, num_buf, virtio16_to_cpu(vi->vdev, hdr->num_buffers)); DEV_STATS_INC(dev, rx_length_errors); goto err_buf; } u64_stats_add(&stats->bytes, len); page = virt_to_head_page(buf); truesize = mergeable_ctx_to_truesize(ctx); headroom = mergeable_ctx_to_headroom(ctx); tailroom = headroom ? sizeof(struct skb_shared_info) : 0; room = SKB_DATA_ALIGN(headroom + tailroom); if (unlikely(len > truesize - room)) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)(truesize - room)); DEV_STATS_INC(dev, rx_length_errors); goto err_skb; } curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, buf, len, truesize); if (!curr_skb) goto err_skb; } ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len); return head_skb; err_skb: put_page(page); mergeable_buf_free(rq, num_buf, dev, stats); err_buf: u64_stats_inc(&stats->drops); dev_kfree_skb(head_skb); return NULL; } static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, struct sk_buff *skb) { enum pkt_hash_types rss_hash_type; if (!hdr_hash || !skb) return; switch (__le16_to_cpu(hdr_hash->hash_report)) { case VIRTIO_NET_HASH_REPORT_TCPv4: case VIRTIO_NET_HASH_REPORT_UDPv4: case VIRTIO_NET_HASH_REPORT_TCPv6: case VIRTIO_NET_HASH_REPORT_UDPv6: case VIRTIO_NET_HASH_REPORT_TCPv6_EX: case VIRTIO_NET_HASH_REPORT_UDPv6_EX: rss_hash_type = PKT_HASH_TYPE_L4; break; case VIRTIO_NET_HASH_REPORT_IPv4: case VIRTIO_NET_HASH_REPORT_IPv6: case VIRTIO_NET_HASH_REPORT_IPv6_EX: rss_hash_type = PKT_HASH_TYPE_L3; break; case VIRTIO_NET_HASH_REPORT_NONE: default: rss_hash_type = PKT_HASH_TYPE_NONE; } skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type); } static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, struct sk_buff *skb, u8 flags) { struct virtio_net_common_hdr *hdr; struct net_device *dev = vi->dev; hdr = skb_vnet_common_hdr(skb); if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report) virtio_skb_set_hash(&hdr->hash_v1_hdr, skb); if (flags & VIRTIO_NET_HDR_F_DATA_VALID) skb->ip_summed = CHECKSUM_UNNECESSARY; if (virtio_net_hdr_to_skb(skb, &hdr->hdr, virtio_is_little_endian(vi->vdev))) { net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n", dev->name, hdr->hdr.gso_type, hdr->hdr.gso_size); goto frame_err; } skb_record_rx_queue(skb, vq2rxq(rq->vq)); skb->protocol = eth_type_trans(skb, dev); pr_debug("Receiving skb proto 0x%04x len %i type %i\n", ntohs(skb->protocol), skb->len, skb->pkt_type); napi_gro_receive(&rq->napi, skb); return; frame_err: DEV_STATS_INC(dev, rx_frame_errors); dev_kfree_skb(skb); } static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, void *buf, unsigned int len, void **ctx, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct net_device *dev = vi->dev; struct sk_buff *skb; u8 flags; if (unlikely(len < vi->hdr_len + ETH_HLEN)) { pr_debug("%s: short packet %i\n", dev->name, len); DEV_STATS_INC(dev, rx_length_errors); virtnet_rq_free_buf(vi, rq, buf); return; } /* 1. Save the flags early, as the XDP program might overwrite them. * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID * stay valid after XDP processing. * 2. XDP doesn't work with partially checksummed packets (refer to * virtnet_xdp_set()), so packets marked as * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing. */ flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; if (vi->mergeable_rx_bufs) skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); else if (vi->big_packets) skb = receive_big(dev, vi, rq, buf, len, stats); else skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); if (unlikely(!skb)) return; virtnet_receive_done(vi, rq, skb, flags); } /* Unlike mergeable buffers, all buffers are allocated to the * same size, except for the headroom. For this reason we do * not need to use mergeable_len_to_ctx here - it is enough * to store the headroom as the context ignoring the truesize. */ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { char *buf; unsigned int xdp_headroom = virtnet_get_headroom(vi); void *ctx = (void *)(unsigned long)xdp_headroom; int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom; int err; len = SKB_DATA_ALIGN(len) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (unlikely(!skb_page_frag_refill(len, &rq->alloc_frag, gfp))) return -ENOMEM; buf = virtnet_rq_alloc(rq, len, gfp); if (unlikely(!buf)) return -ENOMEM; buf += VIRTNET_RX_PAD + xdp_headroom; virtnet_rq_init_one_sg(rq, buf, vi->hdr_len + GOOD_PACKET_LEN); err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); } return err; } static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { struct page *first, *list = NULL; char *p; int i, err, offset; sg_init_table(rq->sg, vi->big_packets_num_skbfrags + 2); /* page in rq->sg[vi->big_packets_num_skbfrags + 1] is list tail */ for (i = vi->big_packets_num_skbfrags + 1; i > 1; --i) { first = get_a_page(rq, gfp); if (!first) { if (list) give_pages(rq, list); return -ENOMEM; } sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE); /* chain new page in list head to match sg */ first->private = (unsigned long)list; list = first; } first = get_a_page(rq, gfp); if (!first) { give_pages(rq, list); return -ENOMEM; } p = page_address(first); /* rq->sg[0], rq->sg[1] share the same page */ /* a separated rq->sg[0] for header - required in case !any_header_sg */ sg_set_buf(&rq->sg[0], p, vi->hdr_len); /* rq->sg[1] for data packet, from offset */ offset = sizeof(struct padded_vnet_hdr); sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset); /* chain first in list head */ first->private = (unsigned long)list; err = virtqueue_add_inbuf(rq->vq, rq->sg, vi->big_packets_num_skbfrags + 2, first, gfp); if (err < 0) give_pages(rq, first); return err; } static unsigned int get_mergeable_buf_len(struct receive_queue *rq, struct ewma_pkt_len *avg_pkt_len, unsigned int room) { struct virtnet_info *vi = rq->vq->vdev->priv; const size_t hdr_len = vi->hdr_len; unsigned int len; if (room) return PAGE_SIZE - room; len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len), rq->min_buf_len, PAGE_SIZE - hdr_len); return ALIGN(len, L1_CACHE_BYTES); } static int add_recvbuf_mergeable(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { struct page_frag *alloc_frag = &rq->alloc_frag; unsigned int headroom = virtnet_get_headroom(vi); unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0; unsigned int room = SKB_DATA_ALIGN(headroom + tailroom); unsigned int len, hole; void *ctx; char *buf; int err; /* Extra tailroom is needed to satisfy XDP's assumption. This * means rx frags coalescing won't work, but consider we've * disabled GSO for XDP, it won't be a big issue. */ len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) return -ENOMEM; if (!alloc_frag->offset && len + room + sizeof(struct virtnet_rq_dma) > alloc_frag->size) len -= sizeof(struct virtnet_rq_dma); buf = virtnet_rq_alloc(rq, len + room, gfp); if (unlikely(!buf)) return -ENOMEM; buf += headroom; /* advance address leaving hole at front of pkt */ hole = alloc_frag->size - alloc_frag->offset; if (hole < len + room) { /* To avoid internal fragmentation, if there is very likely not * enough space for another buffer, add the remaining space to * the current buffer. * XDP core assumes that frame_size of xdp_buff and the length * of the frag are PAGE_SIZE, so we disable the hole mechanism. */ if (!headroom) len += hole; alloc_frag->offset += hole; } virtnet_rq_init_one_sg(rq, buf, len); ctx = mergeable_len_to_ctx(len + room, headroom); err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); } return err; } /* * Returns false if we couldn't fill entirely (OOM). * * Normally run in the receive path, but can also be run from ndo_open * before we're receiving packets, or from refill_work which is * careful to disable receiving (using napi_disable). */ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { int err; if (rq->xsk_pool) { err = virtnet_add_recvbuf_xsk(vi, rq, rq->xsk_pool, gfp); goto kick; } do { if (vi->mergeable_rx_bufs) err = add_recvbuf_mergeable(vi, rq, gfp); else if (vi->big_packets) err = add_recvbuf_big(vi, rq, gfp); else err = add_recvbuf_small(vi, rq, gfp); if (err) break; } while (rq->vq->num_free); kick: if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) { unsigned long flags; flags = u64_stats_update_begin_irqsave(&rq->stats.syncp); u64_stats_inc(&rq->stats.kicks); u64_stats_update_end_irqrestore(&rq->stats.syncp, flags); } return err != -ENOMEM; } static void skb_recv_done(struct virtqueue *rvq) { struct virtnet_info *vi = rvq->vdev->priv; struct receive_queue *rq = &vi->rq[vq2rxq(rvq)]; rq->calls++; virtqueue_napi_schedule(&rq->napi, rvq); } static void virtnet_napi_do_enable(struct virtqueue *vq, struct napi_struct *napi) { napi_enable(napi); /* If all buffers were filled by other side before we napi_enabled, we * won't get another interrupt, so process any outstanding packets now. * Call local_bh_enable after to trigger softIRQ processing. */ local_bh_disable(); virtqueue_napi_schedule(napi, vq); local_bh_enable(); } static void virtnet_napi_enable(struct receive_queue *rq) { struct virtnet_info *vi = rq->vq->vdev->priv; int qidx = vq2rxq(rq->vq); virtnet_napi_do_enable(rq->vq, &rq->napi); netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, &rq->napi); } static void virtnet_napi_tx_enable(struct send_queue *sq) { struct virtnet_info *vi = sq->vq->vdev->priv; struct napi_struct *napi = &sq->napi; int qidx = vq2txq(sq->vq); if (!napi->weight) return; /* Tx napi touches cachelines on the cpu handling tx interrupts. Only * enable the feature if this is likely affine with the transmit path. */ if (!vi->affinity_hint_set) { napi->weight = 0; return; } virtnet_napi_do_enable(sq->vq, napi); netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, napi); } static void virtnet_napi_tx_disable(struct send_queue *sq) { struct virtnet_info *vi = sq->vq->vdev->priv; struct napi_struct *napi = &sq->napi; int qidx = vq2txq(sq->vq); if (napi->weight) { netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, NULL); napi_disable(napi); } } static void virtnet_napi_disable(struct receive_queue *rq) { struct virtnet_info *vi = rq->vq->vdev->priv; struct napi_struct *napi = &rq->napi; int qidx = vq2rxq(rq->vq); netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, NULL); napi_disable(napi); } static void refill_work(struct work_struct *work) { struct virtnet_info *vi = container_of(work, struct virtnet_info, refill.work); bool still_empty; int i; for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; /* * When queue API support is added in the future and the call * below becomes napi_disable_locked, this driver will need to * be refactored. * * One possible solution would be to: * - cancel refill_work with cancel_delayed_work (note: * non-sync) * - cancel refill_work with cancel_delayed_work_sync in * virtnet_remove after the netdev is unregistered * - wrap all of the work in a lock (perhaps the netdev * instance lock) * - check netif_running() and return early to avoid a race */ napi_disable(&rq->napi); still_empty = !try_fill_recv(vi, rq, GFP_KERNEL); virtnet_napi_do_enable(rq->vq, &rq->napi); /* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again. */ if (still_empty) schedule_delayed_work(&vi->refill, HZ/2); } } static int virtnet_receive_xsk_bufs(struct virtnet_info *vi, struct receive_queue *rq, int budget, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { unsigned int len; int packets = 0; void *buf; while (packets < budget) { buf = virtqueue_get_buf(rq->vq, &len); if (!buf) break; virtnet_receive_xsk_buf(vi, rq, buf, len, xdp_xmit, stats); packets++; } return packets; } static int virtnet_receive_packets(struct virtnet_info *vi, struct receive_queue *rq, int budget, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { unsigned int len; int packets = 0; void *buf; if (!vi->big_packets || vi->mergeable_rx_bufs) { void *ctx; while (packets < budget && (buf = virtnet_rq_get_buf(rq, &len, &ctx))) { receive_buf(vi, rq, buf, len, ctx, xdp_xmit, stats); packets++; } } else { while (packets < budget && (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { receive_buf(vi, rq, buf, len, NULL, xdp_xmit, stats); packets++; } } return packets; } static int virtnet_receive(struct receive_queue *rq, int budget, unsigned int *xdp_xmit) { struct virtnet_info *vi = rq->vq->vdev->priv; struct virtnet_rq_stats stats = {}; int i, packets; if (rq->xsk_pool) packets = virtnet_receive_xsk_bufs(vi, rq, budget, xdp_xmit, &stats); else packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats); if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) { if (!try_fill_recv(vi, rq, GFP_ATOMIC)) { spin_lock(&vi->refill_lock); if (vi->refill_enabled) schedule_delayed_work(&vi->refill, 0); spin_unlock(&vi->refill_lock); } } u64_stats_set(&stats.packets, packets); u64_stats_update_begin(&rq->stats.syncp); for (i = 0; i < ARRAY_SIZE(virtnet_rq_stats_desc); i++) { size_t offset = virtnet_rq_stats_desc[i].offset; u64_stats_t *item, *src; item = (u64_stats_t *)((u8 *)&rq->stats + offset); src = (u64_stats_t *)((u8 *)&stats + offset); u64_stats_add(item, u64_stats_read(src)); } u64_stats_add(&rq->stats.packets, u64_stats_read(&stats.packets)); u64_stats_add(&rq->stats.bytes, u64_stats_read(&stats.bytes)); u64_stats_update_end(&rq->stats.syncp); return packets; } static void virtnet_poll_cleantx(struct receive_queue *rq, int budget) { struct virtnet_info *vi = rq->vq->vdev->priv; unsigned int index = vq2rxq(rq->vq); struct send_queue *sq = &vi->sq[index]; struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index); if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index)) return; if (__netif_tx_trylock(txq)) { if (sq->reset) { __netif_tx_unlock(txq); return; } do { virtqueue_disable_cb(sq->vq); free_old_xmit(sq, txq, !!budget); } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { if (netif_tx_queue_stopped(txq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.wake); u64_stats_update_end(&sq->stats.syncp); } netif_tx_wake_queue(txq); } __netif_tx_unlock(txq); } } static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue *rq) { struct dim_sample cur_sample = {}; if (!rq->packets_in_napi) return; /* Don't need protection when fetching stats, since fetcher and * updater of the stats are in same context */ dim_update_sample(rq->calls, u64_stats_read(&rq->stats.packets), u64_stats_read(&rq->stats.bytes), &cur_sample); net_dim(&rq->dim, &cur_sample); rq->packets_in_napi = 0; } static int virtnet_poll(struct napi_struct *napi, int budget) { struct receive_queue *rq = container_of(napi, struct receive_queue, napi); struct virtnet_info *vi = rq->vq->vdev->priv; struct send_queue *sq; unsigned int received; unsigned int xdp_xmit = 0; bool napi_complete; virtnet_poll_cleantx(rq, budget); received = virtnet_receive(rq, budget, &xdp_xmit); rq->packets_in_napi += received; if (xdp_xmit & VIRTIO_XDP_REDIR) xdp_do_flush(); /* Out of packets? */ if (received < budget) { napi_complete = virtqueue_napi_complete(napi, rq->vq, received); /* Intentionally not taking dim_lock here. This may result in a * spurious net_dim call. But if that happens virtnet_rx_dim_work * will not act on the scheduled work. */ if (napi_complete && rq->dim_enabled) virtnet_rx_dim_update(vi, rq); } if (xdp_xmit & VIRTIO_XDP_TX) { sq = virtnet_xdp_get_sq(vi); if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.kicks); u64_stats_update_end(&sq->stats.syncp); } virtnet_xdp_put_sq(vi, sq); } return received; } static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index) { virtnet_napi_tx_disable(&vi->sq[qp_index]); virtnet_napi_disable(&vi->rq[qp_index]); xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); } static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) { struct net_device *dev = vi->dev; int err; err = xdp_rxq_info_reg(&vi->rq[qp_index].xdp_rxq, dev, qp_index, vi->rq[qp_index].napi.napi_id); if (err < 0) return err; err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) goto err_xdp_reg_mem_model; virtnet_napi_enable(&vi->rq[qp_index]); virtnet_napi_tx_enable(&vi->sq[qp_index]); return 0; err_xdp_reg_mem_model: xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); return err; } static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim) { if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return; net_dim_work_cancel(dim); } static void virtnet_update_settings(struct virtnet_info *vi) { u32 speed; u8 duplex; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX)) return; virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed); if (ethtool_validate_speed(speed)) vi->speed = speed; virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex); if (ethtool_validate_duplex(duplex)) vi->duplex = duplex; } static int virtnet_open(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int i, err; enable_delayed_refill(vi); for (i = 0; i < vi->max_queue_pairs; i++) { if (i < vi->curr_queue_pairs) /* Make sure we have some buffers: if oom use wq. */ if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); err = virtnet_enable_queue_pair(vi, i); if (err < 0) goto err_enable_qp; } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { if (vi->status & VIRTIO_NET_S_LINK_UP) netif_carrier_on(vi->dev); virtio_config_driver_enable(vi->vdev); } else { vi->status = VIRTIO_NET_S_LINK_UP; netif_carrier_on(dev); } return 0; err_enable_qp: disable_delayed_refill(vi); cancel_delayed_work_sync(&vi->refill); for (i--; i >= 0; i--) { virtnet_disable_queue_pair(vi, i); virtnet_cancel_dim(vi, &vi->rq[i].dim); } return err; } static int virtnet_poll_tx(struct napi_struct *napi, int budget) { struct send_queue *sq = container_of(napi, struct send_queue, napi); struct virtnet_info *vi = sq->vq->vdev->priv; unsigned int index = vq2txq(sq->vq); struct netdev_queue *txq; int opaque, xsk_done = 0; bool done; if (unlikely(is_xdp_raw_buffer_queue(vi, index))) { /* We don't need to enable cb for XDP */ napi_complete_done(napi, 0); return 0; } txq = netdev_get_tx_queue(vi->dev, index); __netif_tx_lock(txq, raw_smp_processor_id()); virtqueue_disable_cb(sq->vq); if (sq->xsk_pool) xsk_done = virtnet_xsk_xmit(sq, sq->xsk_pool, budget); else free_old_xmit(sq, txq, !!budget); if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { if (netif_tx_queue_stopped(txq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.wake); u64_stats_update_end(&sq->stats.syncp); } netif_tx_wake_queue(txq); } if (xsk_done >= budget) { __netif_tx_unlock(txq); return budget; } opaque = virtqueue_enable_cb_prepare(sq->vq); done = napi_complete_done(napi, 0); if (!done) virtqueue_disable_cb(sq->vq); __netif_tx_unlock(txq); if (done) { if (unlikely(virtqueue_poll(sq->vq, opaque))) { if (napi_schedule_prep(napi)) { __netif_tx_lock(txq, raw_smp_processor_id()); virtqueue_disable_cb(sq->vq); __netif_tx_unlock(txq); __napi_schedule(napi); } } } return 0; } static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) { struct virtio_net_hdr_mrg_rxbuf *hdr; const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; struct virtnet_info *vi = sq->vq->vdev->priv; int num_sg; unsigned hdr_len = vi->hdr_len; bool can_push; pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); can_push = vi->any_header_sg && !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) && !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len; /* Even if we can, don't push here yet as this would skew * csum_start offset below. */ if (can_push) hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len); else hdr = &skb_vnet_common_hdr(skb)->mrg_hdr; if (virtio_net_hdr_from_skb(skb, &hdr->hdr, virtio_is_little_endian(vi->vdev), false, 0)) return -EPROTO; if (vi->mergeable_rx_bufs) hdr->num_buffers = 0; sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2)); if (can_push) { __skb_push(skb, hdr_len); num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len); if (unlikely(num_sg < 0)) return num_sg; /* Pull header back to avoid skew in tx bytes calculations. */ __skb_pull(skb, hdr_len); } else { sg_set_buf(sq->sg, hdr, hdr_len); num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len); if (unlikely(num_sg < 0)) return num_sg; num_sg++; } return virtnet_add_outbuf(sq, num_sg, skb, orphan ? VIRTNET_XMIT_TYPE_SKB_ORPHAN : VIRTNET_XMIT_TYPE_SKB); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int qnum = skb_get_queue_mapping(skb); struct send_queue *sq = &vi->sq[qnum]; int err; struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); bool xmit_more = netdev_xmit_more(); bool use_napi = sq->napi.weight; bool kick; if (!use_napi) free_old_xmit(sq, txq, false); else virtqueue_disable_cb(sq->vq); /* timestamp packet in software */ skb_tx_timestamp(skb); /* Try to transmit */ err = xmit_skb(sq, skb, !use_napi); /* This should not happen! */ if (unlikely(err)) { DEV_STATS_INC(dev, tx_fifo_errors); if (net_ratelimit()) dev_warn(&dev->dev, "Unexpected TXQ (%d) queue failure: %d\n", qnum, err); DEV_STATS_INC(dev, tx_dropped); dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* Don't wait up for transmitted skbs to be freed. */ if (!use_napi) { skb_orphan(skb); nf_reset_ct(skb); } if (use_napi) tx_may_stop(vi, dev, sq); else check_sq_full_and_disable(vi, dev,sq); kick = use_napi ? __netdev_tx_sent_queue(txq, skb->len, xmit_more) : !xmit_more || netif_xmit_stopped(txq); if (kick) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.kicks); u64_stats_update_end(&sq->stats.syncp); } } if (use_napi && kick && unlikely(!virtqueue_enable_cb_delayed(sq->vq))) virtqueue_napi_schedule(&sq->napi, sq->vq); return NETDEV_TX_OK; } static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) { bool running = netif_running(vi->dev); if (running) { virtnet_napi_disable(rq); virtnet_cancel_dim(vi, &rq->dim); } } static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) { bool running = netif_running(vi->dev); if (!try_fill_recv(vi, rq, GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); if (running) virtnet_napi_enable(rq); } static int virtnet_rx_resize(struct virtnet_info *vi, struct receive_queue *rq, u32 ring_num) { int err, qindex; qindex = rq - vi->rq; virtnet_rx_pause(vi, rq); err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf, NULL); if (err) netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); virtnet_rx_resume(vi, rq); return err; } static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq) { bool running = netif_running(vi->dev); struct netdev_queue *txq; int qindex; qindex = sq - vi->sq; if (running) virtnet_napi_tx_disable(sq); txq = netdev_get_tx_queue(vi->dev, qindex); /* 1. wait all ximt complete * 2. fix the race of netif_stop_subqueue() vs netif_start_subqueue() */ __netif_tx_lock_bh(txq); /* Prevent rx poll from accessing sq. */ sq->reset = true; /* Prevent the upper layer from trying to send packets. */ netif_stop_subqueue(vi->dev, qindex); __netif_tx_unlock_bh(txq); } static void virtnet_tx_resume(struct virtnet_info *vi, struct send_queue *sq) { bool running = netif_running(vi->dev); struct netdev_queue *txq; int qindex; qindex = sq - vi->sq; txq = netdev_get_tx_queue(vi->dev, qindex); __netif_tx_lock_bh(txq); sq->reset = false; netif_tx_wake_queue(txq); __netif_tx_unlock_bh(txq); if (running) virtnet_napi_tx_enable(sq); } static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, u32 ring_num) { int qindex, err; qindex = sq - vi->sq; virtnet_tx_pause(vi, sq); err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf, virtnet_sq_free_unused_buf_done); if (err) netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); virtnet_tx_resume(vi, sq); return err; } /* * Send command via the control virtqueue and check status. Commands * supported by the hypervisor, as indicated by feature bits, should * never fail unless improperly formatted. */ static bool virtnet_send_command_reply(struct virtnet_info *vi, u8 class, u8 cmd, struct scatterlist *out, struct scatterlist *in) { struct scatterlist *sgs[5], hdr, stat; u32 out_num = 0, tmp, in_num = 0; bool ok; int ret; /* Caller should know better */ BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); mutex_lock(&vi->cvq_lock); vi->ctrl->status = ~0; vi->ctrl->hdr.class = class; vi->ctrl->hdr.cmd = cmd; /* Add header */ sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr)); sgs[out_num++] = &hdr; if (out) sgs[out_num++] = out; /* Add return status. */ sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status)); sgs[out_num + in_num++] = &stat; if (in) sgs[out_num + in_num++] = in; BUG_ON(out_num + in_num > ARRAY_SIZE(sgs)); ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC); if (ret < 0) { dev_warn(&vi->vdev->dev, "Failed to add sgs for command vq: %d\n.", ret); mutex_unlock(&vi->cvq_lock); return false; } if (unlikely(!virtqueue_kick(vi->cvq))) goto unlock; /* Spin for a response, the kick causes an ioport write, trapping * into the hypervisor, so the request should be handled immediately. */ while (!virtqueue_get_buf(vi->cvq, &tmp) && !virtqueue_is_broken(vi->cvq)) { cond_resched(); cpu_relax(); } unlock: ok = vi->ctrl->status == VIRTIO_NET_OK; mutex_unlock(&vi->cvq_lock); return ok; } static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, struct scatterlist *out) { return virtnet_send_command_reply(vi, class, cmd, out, NULL); } static int virtnet_set_mac_address(struct net_device *dev, void *p) { struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi->vdev; int ret; struct sockaddr *addr; struct scatterlist sg; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY)) return -EOPNOTSUPP; addr = kmemdup(p, sizeof(*addr), GFP_KERNEL); if (!addr) return -ENOMEM; ret = eth_prepare_mac_addr_change(dev, addr); if (ret) goto out; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { sg_init_one(&sg, addr->sa_data, dev->addr_len); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) { dev_warn(&vdev->dev, "Failed to set mac address by vq command.\n"); ret = -EINVAL; goto out; } } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) && !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) { unsigned int i; /* Naturally, this has an atomicity problem. */ for (i = 0; i < dev->addr_len; i++) virtio_cwrite8(vdev, offsetof(struct virtio_net_config, mac) + i, addr->sa_data[i]); } eth_commit_mac_addr_change(dev, p); ret = 0; out: kfree(addr); return ret; } static void virtnet_stats(struct net_device *dev, struct rtnl_link_stats64 *tot) { struct virtnet_info *vi = netdev_priv(dev); unsigned int start; int i; for (i = 0; i < vi->max_queue_pairs; i++) { u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops; struct receive_queue *rq = &vi->rq[i]; struct send_queue *sq = &vi->sq[i]; do { start = u64_stats_fetch_begin(&sq->stats.syncp); tpackets = u64_stats_read(&sq->stats.packets); tbytes = u64_stats_read(&sq->stats.bytes); terrors = u64_stats_read(&sq->stats.tx_timeouts); } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); do { start = u64_stats_fetch_begin(&rq->stats.syncp); rpackets = u64_stats_read(&rq->stats.packets); rbytes = u64_stats_read(&rq->stats.bytes); rdrops = u64_stats_read(&rq->stats.drops); } while (u64_stats_fetch_retry(&rq->stats.syncp, start)); tot->rx_packets += rpackets; tot->tx_packets += tpackets; tot->rx_bytes += rbytes; tot->tx_bytes += tbytes; tot->rx_dropped += rdrops; tot->tx_errors += terrors; } tot->tx_dropped = DEV_STATS_READ(dev, tx_dropped); tot->tx_fifo_errors = DEV_STATS_READ(dev, tx_fifo_errors); tot->rx_length_errors = DEV_STATS_READ(dev, rx_length_errors); tot->rx_frame_errors = DEV_STATS_READ(dev, rx_frame_errors); } static void virtnet_ack_link_announce(struct virtnet_info *vi) { if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL)) dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); } static bool virtnet_commit_rss_command(struct virtnet_info *vi); static void virtnet_rss_update_by_qpairs(struct virtnet_info *vi, u16 queue_pairs) { u32 indir_val = 0; int i = 0; for (; i < vi->rss_indir_table_size; ++i) { indir_val = ethtool_rxfh_indir_default(i, queue_pairs); vi->rss_hdr->indirection_table[i] = cpu_to_le16(indir_val); } vi->rss_trailer.max_tx_vq = cpu_to_le16(queue_pairs); } static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) { struct virtio_net_ctrl_mq *mq __free(kfree) = NULL; struct virtio_net_rss_config_hdr *old_rss_hdr; struct virtio_net_rss_config_trailer old_rss_trailer; struct net_device *dev = vi->dev; struct scatterlist sg; if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0; /* Firstly check if we need update rss. Do updating if both (1) rss enabled and * (2) no user configuration. * * During rss command processing, device updates queue_pairs using rss.max_tx_vq. That is, * the device updates queue_pairs together with rss, so we can skip the sperate queue_pairs * update (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. */ if (vi->has_rss && !netif_is_rxfh_configured(dev)) { old_rss_hdr = vi->rss_hdr; old_rss_trailer = vi->rss_trailer; vi->rss_hdr = devm_kzalloc(&dev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); if (!vi->rss_hdr) { vi->rss_hdr = old_rss_hdr; return -ENOMEM; } *vi->rss_hdr = *old_rss_hdr; virtnet_rss_update_by_qpairs(vi, queue_pairs); if (!virtnet_commit_rss_command(vi)) { /* restore ctrl_rss if commit_rss_command failed */ devm_kfree(&dev->dev, vi->rss_hdr); vi->rss_hdr = old_rss_hdr; vi->rss_trailer = old_rss_trailer; dev_warn(&dev->dev, "Fail to set num of queue pairs to %d, because committing RSS failed\n", queue_pairs); return -EINVAL; } devm_kfree(&dev->dev, old_rss_hdr); goto succ; } mq = kzalloc(sizeof(*mq), GFP_KERNEL); if (!mq) return -ENOMEM; mq->virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs); sg_init_one(&sg, mq, sizeof(*mq)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) { dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", queue_pairs); return -EINVAL; } succ: vi->curr_queue_pairs = queue_pairs; /* virtnet_open() will refill when device is going to up. */ if (dev->flags & IFF_UP) schedule_delayed_work(&vi->refill, 0); return 0; } static int virtnet_close(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int i; /* Make sure NAPI doesn't schedule refill work */ disable_delayed_refill(vi); /* Make sure refill_work doesn't re-enable napi! */ cancel_delayed_work_sync(&vi->refill); /* Prevent the config change callback from changing carrier * after close */ virtio_config_driver_disable(vi->vdev); /* Stop getting status/speed updates: we don't care until next * open */ cancel_work_sync(&vi->config_work); for (i = 0; i < vi->max_queue_pairs; i++) { virtnet_disable_queue_pair(vi, i); virtnet_cancel_dim(vi, &vi->rq[i].dim); } netif_carrier_off(dev); return 0; } static void virtnet_rx_mode_work(struct work_struct *work) { struct virtnet_info *vi = container_of(work, struct virtnet_info, rx_mode_work); u8 *promisc_allmulti __free(kfree) = NULL; struct net_device *dev = vi->dev; struct scatterlist sg[2]; struct virtio_net_ctrl_mac *mac_data; struct netdev_hw_addr *ha; int uc_count; int mc_count; void *buf; int i; /* We can't dynamically set ndo_set_rx_mode, so return gracefully */ if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return; promisc_allmulti = kzalloc(sizeof(*promisc_allmulti), GFP_KERNEL); if (!promisc_allmulti) { dev_warn(&dev->dev, "Failed to set RX mode, no memory.\n"); return; } rtnl_lock(); *promisc_allmulti = !!(dev->flags & IFF_PROMISC); sg_init_one(sg, promisc_allmulti, sizeof(*promisc_allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, sg)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", *promisc_allmulti ? "en" : "dis"); *promisc_allmulti = !!(dev->flags & IFF_ALLMULTI); sg_init_one(sg, promisc_allmulti, sizeof(*promisc_allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, sg)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", *promisc_allmulti ? "en" : "dis"); netif_addr_lock_bh(dev); uc_count = netdev_uc_count(dev); mc_count = netdev_mc_count(dev); /* MAC filter - use one buffer for both lists */ buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) + (2 * sizeof(mac_data->entries)), GFP_ATOMIC); mac_data = buf; if (!buf) { netif_addr_unlock_bh(dev); rtnl_unlock(); return; } sg_init_table(sg, 2); /* Store the unicast list and count in the front of the buffer */ mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count); i = 0; netdev_for_each_uc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); sg_set_buf(&sg[0], mac_data, sizeof(mac_data->entries) + (uc_count * ETH_ALEN)); /* multicast list and count fill the end */ mac_data = (void *)&mac_data->macs[uc_count][0]; mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count); i = 0; netdev_for_each_mc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); netif_addr_unlock_bh(dev); sg_set_buf(&sg[1], mac_data, sizeof(mac_data->entries) + (mc_count * ETH_ALEN)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_TABLE_SET, sg)) dev_warn(&dev->dev, "Failed to set MAC filter table.\n"); rtnl_unlock(); kfree(buf); } static void virtnet_set_rx_mode(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); if (vi->rx_mode_work_enabled) schedule_work(&vi->rx_mode_work); } static int virtnet_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct virtnet_info *vi = netdev_priv(dev); __virtio16 *_vid __free(kfree) = NULL; struct scatterlist sg; _vid = kzalloc(sizeof(*_vid), GFP_KERNEL); if (!_vid) return -ENOMEM; *_vid = cpu_to_virtio16(vi->vdev, vid); sg_init_one(&sg, _vid, sizeof(*_vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_ADD, &sg)) dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); return 0; } static int virtnet_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct virtnet_info *vi = netdev_priv(dev); __virtio16 *_vid __free(kfree) = NULL; struct scatterlist sg; _vid = kzalloc(sizeof(*_vid), GFP_KERNEL); if (!_vid) return -ENOMEM; *_vid = cpu_to_virtio16(vi->vdev, vid); sg_init_one(&sg, _vid, sizeof(*_vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_DEL, &sg)) dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); return 0; } static void virtnet_clean_affinity(struct virtnet_info *vi) { int i; if (vi->affinity_hint_set) { for (i = 0; i < vi->max_queue_pairs; i++) { virtqueue_set_affinity(vi->rq[i].vq, NULL); virtqueue_set_affinity(vi->sq[i].vq, NULL); } vi->affinity_hint_set = false; } } static void virtnet_set_affinity(struct virtnet_info *vi) { cpumask_var_t mask; int stragglers; int group_size; int i, start = 0, cpu; int num_cpu; int stride; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { virtnet_clean_affinity(vi); return; } num_cpu = num_online_cpus(); stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1); stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); for_each_online_cpu_wrap(cpu, start) { if (!group_size--) { start = cpu; break; } cpumask_set_cpu(cpu, mask); } virtqueue_set_affinity(vi->rq[i].vq, mask); virtqueue_set_affinity(vi->sq[i].vq, mask); __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS); cpumask_clear(mask); } vi->affinity_hint_set = true; free_cpumask_var(mask); } static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node) { struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, node); virtnet_set_affinity(vi); return 0; } static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, node_dead); virtnet_set_affinity(vi); return 0; } static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node) { struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, node); virtnet_clean_affinity(vi); return 0; } static enum cpuhp_state virtionet_online; static int virtnet_cpu_notif_add(struct virtnet_info *vi) { int ret; ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node); if (ret) return ret; ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD, &vi->node_dead); if (!ret) return ret; cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node); return ret; } static void virtnet_cpu_notif_remove(struct virtnet_info *vi) { cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node); cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD, &vi->node_dead); } static int virtnet_send_ctrl_coal_vq_cmd(struct virtnet_info *vi, u16 vqn, u32 max_usecs, u32 max_packets) { struct virtio_net_ctrl_coal_vq *coal_vq __free(kfree) = NULL; struct scatterlist sgs; coal_vq = kzalloc(sizeof(*coal_vq), GFP_KERNEL); if (!coal_vq) return -ENOMEM; coal_vq->vqn = cpu_to_le16(vqn); coal_vq->coal.max_usecs = cpu_to_le32(max_usecs); coal_vq->coal.max_packets = cpu_to_le32(max_packets); sg_init_one(&sgs, coal_vq, sizeof(*coal_vq)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_VQ_SET, &sgs)) return -EINVAL; return 0; } static int virtnet_send_rx_ctrl_coal_vq_cmd(struct virtnet_info *vi, u16 queue, u32 max_usecs, u32 max_packets) { int err; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return -EOPNOTSUPP; err = virtnet_send_ctrl_coal_vq_cmd(vi, rxq2vq(queue), max_usecs, max_packets); if (err) return err; vi->rq[queue].intr_coal.max_usecs = max_usecs; vi->rq[queue].intr_coal.max_packets = max_packets; return 0; } static int virtnet_send_tx_ctrl_coal_vq_cmd(struct virtnet_info *vi, u16 queue, u32 max_usecs, u32 max_packets) { int err; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return -EOPNOTSUPP; err = virtnet_send_ctrl_coal_vq_cmd(vi, txq2vq(queue), max_usecs, max_packets); if (err) return err; vi->sq[queue].intr_coal.max_usecs = max_usecs; vi->sq[queue].intr_coal.max_packets = max_packets; return 0; } static void virtnet_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring, struct kernel_ethtool_ringparam *kernel_ring, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); ring->rx_max_pending = vi->rq[0].vq->num_max; ring->tx_max_pending = vi->sq[0].vq->num_max; ring->rx_pending = virtqueue_get_vring_size(vi->rq[0].vq); ring->tx_pending = virtqueue_get_vring_size(vi->sq[0].vq); } static int virtnet_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ring, struct kernel_ethtool_ringparam *kernel_ring, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); u32 rx_pending, tx_pending; struct receive_queue *rq; struct send_queue *sq; int i, err; if (ring->rx_mini_pending || ring->rx_jumbo_pending) return -EINVAL; rx_pending = virtqueue_get_vring_size(vi->rq[0].vq); tx_pending = virtqueue_get_vring_size(vi->sq[0].vq); if (ring->rx_pending == rx_pending && ring->tx_pending == tx_pending) return 0; if (ring->rx_pending > vi->rq[0].vq->num_max) return -EINVAL; if (ring->tx_pending > vi->sq[0].vq->num_max) return -EINVAL; for (i = 0; i < vi->max_queue_pairs; i++) { rq = vi->rq + i; sq = vi->sq + i; if (ring->tx_pending != tx_pending) { err = virtnet_tx_resize(vi, sq, ring->tx_pending); if (err) return err; /* Upon disabling and re-enabling a transmit virtqueue, the device must * set the coalescing parameters of the virtqueue to those configured * through the VIRTIO_NET_CTRL_NOTF_COAL_TX_SET command, or, if the driver * did not set any TX coalescing parameters, to 0. */ err = virtnet_send_tx_ctrl_coal_vq_cmd(vi, i, vi->intr_coal_tx.max_usecs, vi->intr_coal_tx.max_packets); /* Don't break the tx resize action if the vq coalescing is not * supported. The same is true for rx resize below. */ if (err && err != -EOPNOTSUPP) return err; } if (ring->rx_pending != rx_pending) { err = virtnet_rx_resize(vi, rq, ring->rx_pending); if (err) return err; /* The reason is same as the transmit virtqueue reset */ mutex_lock(&vi->rq[i].dim_lock); err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, i, vi->intr_coal_rx.max_usecs, vi->intr_coal_rx.max_packets); mutex_unlock(&vi->rq[i].dim_lock); if (err && err != -EOPNOTSUPP) return err; } } return 0; } static bool virtnet_commit_rss_command(struct virtnet_info *vi) { struct net_device *dev = vi->dev; struct scatterlist sgs[2]; /* prepare sgs */ sg_init_table(sgs, 2); sg_set_buf(&sgs[0], vi->rss_hdr, virtnet_rss_hdr_size(vi)); sg_set_buf(&sgs[1], &vi->rss_trailer, virtnet_rss_trailer_size(vi)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, vi->has_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG : VIRTIO_NET_CTRL_MQ_HASH_CONFIG, sgs)) goto err; return true; err: dev_warn(&dev->dev, "VIRTIONET issue with committing RSS sgs\n"); return false; } static void virtnet_init_default_rss(struct virtnet_info *vi) { vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_supported); vi->rss_hash_types_saved = vi->rss_hash_types_supported; vi->rss_hdr->indirection_table_mask = vi->rss_indir_table_size ? cpu_to_le16(vi->rss_indir_table_size - 1) : 0; vi->rss_hdr->unclassified_queue = 0; virtnet_rss_update_by_qpairs(vi, vi->curr_queue_pairs); vi->rss_trailer.hash_key_length = vi->rss_key_size; netdev_rss_key_fill(vi->rss_hash_key_data, vi->rss_key_size); } static void virtnet_get_hashflow(const struct virtnet_info *vi, struct ethtool_rxnfc *info) { info->data = 0; switch (info->flow_type) { case TCP_V4_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) { info->data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3; } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { info->data = RXH_IP_SRC | RXH_IP_DST; } break; case TCP_V6_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) { info->data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3; } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { info->data = RXH_IP_SRC | RXH_IP_DST; } break; case UDP_V4_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) { info->data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3; } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { info->data = RXH_IP_SRC | RXH_IP_DST; } break; case UDP_V6_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) { info->data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3; } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { info->data = RXH_IP_SRC | RXH_IP_DST; } break; case IPV4_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) info->data = RXH_IP_SRC | RXH_IP_DST; break; case IPV6_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) info->data = RXH_IP_SRC | RXH_IP_DST; break; default: info->data = 0; break; } } static bool virtnet_set_hashflow(struct virtnet_info *vi, struct ethtool_rxnfc *info) { u32 new_hashtypes = vi->rss_hash_types_saved; bool is_disable = info->data & RXH_DISCARD; bool is_l4 = info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3); /* supports only 'sd', 'sdfn' and 'r' */ if (!((info->data == (RXH_IP_SRC | RXH_IP_DST)) | is_l4 | is_disable)) return false; switch (info->flow_type) { case TCP_V4_FLOW: new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_TCPv4); if (!is_disable) new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv4 : 0); break; case UDP_V4_FLOW: new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_UDPv4); if (!is_disable) new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv4 : 0); break; case IPV4_FLOW: new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv4; if (!is_disable) new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv4; break; case TCP_V6_FLOW: new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_TCPv6); if (!is_disable) new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv6 : 0); break; case UDP_V6_FLOW: new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_UDPv6); if (!is_disable) new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv6 : 0); break; case IPV6_FLOW: new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv6; if (!is_disable) new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv6; break; default: /* unsupported flow */ return false; } /* if unsupported hashtype was set */ if (new_hashtypes != (new_hashtypes & vi->rss_hash_types_supported)) return false; if (new_hashtypes != vi->rss_hash_types_saved) { vi->rss_hash_types_saved = new_hashtypes; vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_saved); if (vi->dev->features & NETIF_F_RXHASH) return virtnet_commit_rss_command(vi); } return true; } static void virtnet_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi->vdev; strscpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); strscpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version)); strscpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info)); } /* TODO: Eliminate OOO packets during switching */ static int virtnet_set_channels(struct net_device *dev, struct ethtool_channels *channels) { struct virtnet_info *vi = netdev_priv(dev); u16 queue_pairs = channels->combined_count; int err; /* We don't support separate rx/tx channels. * We don't allow setting 'other' channels. */ if (channels->rx_count || channels->tx_count || channels->other_count) return -EINVAL; if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0) return -EINVAL; /* For now we don't support modifying channels while XDP is loaded * also when XDP is loaded all RX queues have XDP programs so we only * need to check a single RX queue. */ if (vi->rq[0].xdp_prog) return -EINVAL; cpus_read_lock(); err = virtnet_set_queues(vi, queue_pairs); if (err) { cpus_read_unlock(); goto err; } virtnet_set_affinity(vi); cpus_read_unlock(); netif_set_real_num_tx_queues(dev, queue_pairs); netif_set_real_num_rx_queues(dev, queue_pairs); err: return err; } static void virtnet_stats_sprintf(u8 **p, const char *fmt, const char *noq_fmt, int num, int qid, const struct virtnet_stat_desc *desc) { int i; if (qid < 0) { for (i = 0; i < num; ++i) ethtool_sprintf(p, noq_fmt, desc[i].desc); } else { for (i = 0; i < num; ++i) ethtool_sprintf(p, fmt, qid, desc[i].desc); } } /* qid == -1: for rx/tx queue total field */ static void virtnet_get_stats_string(struct virtnet_info *vi, int type, int qid, u8 **data) { const struct virtnet_stat_desc *desc; const char *fmt, *noq_fmt; u8 *p = *data; u32 num; if (type == VIRTNET_Q_TYPE_CQ && qid >= 0) { noq_fmt = "cq_hw_%s"; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_CVQ) { desc = &virtnet_stats_cvq_desc[0]; num = ARRAY_SIZE(virtnet_stats_cvq_desc); virtnet_stats_sprintf(&p, NULL, noq_fmt, num, -1, desc); } } if (type == VIRTNET_Q_TYPE_RX) { fmt = "rx%u_%s"; noq_fmt = "rx_%s"; desc = &virtnet_rq_stats_desc[0]; num = ARRAY_SIZE(virtnet_rq_stats_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); fmt = "rx%u_hw_%s"; noq_fmt = "rx_hw_%s"; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { desc = &virtnet_stats_rx_basic_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_basic_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { desc = &virtnet_stats_rx_csum_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_csum_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { desc = &virtnet_stats_rx_speed_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_speed_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } } if (type == VIRTNET_Q_TYPE_TX) { fmt = "tx%u_%s"; noq_fmt = "tx_%s"; desc = &virtnet_sq_stats_desc[0]; num = ARRAY_SIZE(virtnet_sq_stats_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); fmt = "tx%u_hw_%s"; noq_fmt = "tx_hw_%s"; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { desc = &virtnet_stats_tx_basic_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_basic_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { desc = &virtnet_stats_tx_gso_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_gso_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { desc = &virtnet_stats_tx_speed_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_speed_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } } *data = p; } struct virtnet_stats_ctx { /* The stats are write to qstats or ethtool -S */ bool to_qstat; /* Used to calculate the offset inside the output buffer. */ u32 desc_num[3]; /* The actual supported stat types. */ u64 bitmap[3]; /* Used to calculate the reply buffer size. */ u32 size[3]; /* Record the output buffer. */ u64 *data; }; static void virtnet_stats_ctx_init(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, u64 *data, bool to_qstat) { u32 queue_type; ctx->data = data; ctx->to_qstat = to_qstat; if (to_qstat) { ctx->desc_num[VIRTNET_Q_TYPE_RX] = ARRAY_SIZE(virtnet_rq_stats_desc_qstat); ctx->desc_num[VIRTNET_Q_TYPE_TX] = ARRAY_SIZE(virtnet_sq_stats_desc_qstat); queue_type = VIRTNET_Q_TYPE_RX; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_BASIC; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_basic_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_basic); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_CSUM; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_csum_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_csum); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_GSO) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_GSO; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_gso_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_gso); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_SPEED; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_speed_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_speed); } queue_type = VIRTNET_Q_TYPE_TX; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_BASIC; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_basic_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_basic); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_CSUM; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_csum_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_csum); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_GSO; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_gso_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_gso); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_SPEED; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_speed_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_speed); } return; } ctx->desc_num[VIRTNET_Q_TYPE_RX] = ARRAY_SIZE(virtnet_rq_stats_desc); ctx->desc_num[VIRTNET_Q_TYPE_TX] = ARRAY_SIZE(virtnet_sq_stats_desc); if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_CVQ) { queue_type = VIRTNET_Q_TYPE_CQ; ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_CVQ; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_cvq_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_cvq); } queue_type = VIRTNET_Q_TYPE_RX; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_BASIC; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_basic_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_basic); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_CSUM; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_csum_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_csum); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_SPEED; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_speed_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_speed); } queue_type = VIRTNET_Q_TYPE_TX; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_BASIC; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_basic_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_basic); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_GSO; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_gso_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_gso); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_SPEED; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_speed_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_speed); } } /* stats_sum_queue - Calculate the sum of the same fields in sq or rq. * @sum: the position to store the sum values * @num: field num * @q_value: the first queue fields * @q_num: number of the queues */ static void stats_sum_queue(u64 *sum, u32 num, u64 *q_value, u32 q_num) { u32 step = num; int i, j; u64 *p; for (i = 0; i < num; ++i) { p = sum + i; *p = 0; for (j = 0; j < q_num; ++j) *p += *(q_value + i + j * step); } } static void virtnet_fill_total_fields(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx) { u64 *data, *first_rx_q, *first_tx_q; u32 num_cq, num_rx, num_tx; num_cq = ctx->desc_num[VIRTNET_Q_TYPE_CQ]; num_rx = ctx->desc_num[VIRTNET_Q_TYPE_RX]; num_tx = ctx->desc_num[VIRTNET_Q_TYPE_TX]; first_rx_q = ctx->data + num_rx + num_tx + num_cq; first_tx_q = first_rx_q + vi->curr_queue_pairs * num_rx; data = ctx->data; stats_sum_queue(data, num_rx, first_rx_q, vi->curr_queue_pairs); data = ctx->data + num_rx; stats_sum_queue(data, num_tx, first_tx_q, vi->curr_queue_pairs); } static void virtnet_fill_stats_qstat(struct virtnet_info *vi, u32 qid, struct virtnet_stats_ctx *ctx, const u8 *base, bool drv_stats, u8 reply_type) { const struct virtnet_stat_desc *desc; const u64_stats_t *v_stat; u64 offset, bitmap; const __le64 *v; u32 queue_type; int i, num; queue_type = vq_type(vi, qid); bitmap = ctx->bitmap[queue_type]; if (drv_stats) { if (queue_type == VIRTNET_Q_TYPE_RX) { desc = &virtnet_rq_stats_desc_qstat[0]; num = ARRAY_SIZE(virtnet_rq_stats_desc_qstat); } else { desc = &virtnet_sq_stats_desc_qstat[0]; num = ARRAY_SIZE(virtnet_sq_stats_desc_qstat); } for (i = 0; i < num; ++i) { offset = desc[i].qstat_offset / sizeof(*ctx->data); v_stat = (const u64_stats_t *)(base + desc[i].offset); ctx->data[offset] = u64_stats_read(v_stat); } return; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { desc = &virtnet_stats_rx_basic_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_rx_basic_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_BASIC) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { desc = &virtnet_stats_rx_csum_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_rx_csum_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_CSUM) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_GSO) { desc = &virtnet_stats_rx_gso_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_rx_gso_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_GSO) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { desc = &virtnet_stats_rx_speed_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_rx_speed_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_SPEED) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { desc = &virtnet_stats_tx_basic_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_tx_basic_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_BASIC) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { desc = &virtnet_stats_tx_csum_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_tx_csum_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_CSUM) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_GSO) { desc = &virtnet_stats_tx_gso_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_tx_gso_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_GSO) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { desc = &virtnet_stats_tx_speed_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_tx_speed_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_SPEED) goto found; } return; found: for (i = 0; i < num; ++i) { offset = desc[i].qstat_offset / sizeof(*ctx->data); v = (const __le64 *)(base + desc[i].offset); ctx->data[offset] = le64_to_cpu(*v); } } /* virtnet_fill_stats - copy the stats to qstats or ethtool -S * The stats source is the device or the driver. * * @vi: virtio net info * @qid: the vq id * @ctx: stats ctx (initiated by virtnet_stats_ctx_init()) * @base: pointer to the device reply or the driver stats structure. * @drv_stats: designate the base type (device reply, driver stats) * @type: the type of the device reply (if drv_stats is true, this must be zero) */ static void virtnet_fill_stats(struct virtnet_info *vi, u32 qid, struct virtnet_stats_ctx *ctx, const u8 *base, bool drv_stats, u8 reply_type) { u32 queue_type, num_rx, num_tx, num_cq; const struct virtnet_stat_desc *desc; const u64_stats_t *v_stat; u64 offset, bitmap; const __le64 *v; int i, num; if (ctx->to_qstat) return virtnet_fill_stats_qstat(vi, qid, ctx, base, drv_stats, reply_type); num_cq = ctx->desc_num[VIRTNET_Q_TYPE_CQ]; num_rx = ctx->desc_num[VIRTNET_Q_TYPE_RX]; num_tx = ctx->desc_num[VIRTNET_Q_TYPE_TX]; queue_type = vq_type(vi, qid); bitmap = ctx->bitmap[queue_type]; /* skip the total fields of pairs */ offset = num_rx + num_tx; if (queue_type == VIRTNET_Q_TYPE_TX) { offset += num_cq + num_rx * vi->curr_queue_pairs + num_tx * (qid / 2); num = ARRAY_SIZE(virtnet_sq_stats_desc); if (drv_stats) { desc = &virtnet_sq_stats_desc[0]; goto drv_stats; } offset += num; } else if (queue_type == VIRTNET_Q_TYPE_RX) { offset += num_cq + num_rx * (qid / 2); num = ARRAY_SIZE(virtnet_rq_stats_desc); if (drv_stats) { desc = &virtnet_rq_stats_desc[0]; goto drv_stats; } offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_CVQ) { desc = &virtnet_stats_cvq_desc[0]; num = ARRAY_SIZE(virtnet_stats_cvq_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_CVQ) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { desc = &virtnet_stats_rx_basic_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_basic_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_BASIC) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { desc = &virtnet_stats_rx_csum_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_csum_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_CSUM) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { desc = &virtnet_stats_rx_speed_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_speed_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_SPEED) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { desc = &virtnet_stats_tx_basic_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_basic_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_BASIC) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_GSO) { desc = &virtnet_stats_tx_gso_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_gso_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_GSO) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { desc = &virtnet_stats_tx_speed_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_speed_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_SPEED) goto found; offset += num; } return; found: for (i = 0; i < num; ++i) { v = (const __le64 *)(base + desc[i].offset); ctx->data[offset + i] = le64_to_cpu(*v); } return; drv_stats: for (i = 0; i < num; ++i) { v_stat = (const u64_stats_t *)(base + desc[i].offset); ctx->data[offset + i] = u64_stats_read(v_stat); } } static int __virtnet_get_hw_stats(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, struct virtio_net_ctrl_queue_stats *req, int req_size, void *reply, int res_size) { struct virtio_net_stats_reply_hdr *hdr; struct scatterlist sgs_in, sgs_out; void *p; u32 qid; int ok; sg_init_one(&sgs_out, req, req_size); sg_init_one(&sgs_in, reply, res_size); ok = virtnet_send_command_reply(vi, VIRTIO_NET_CTRL_STATS, VIRTIO_NET_CTRL_STATS_GET, &sgs_out, &sgs_in); if (!ok) return ok; for (p = reply; p - reply < res_size; p += le16_to_cpu(hdr->size)) { hdr = p; qid = le16_to_cpu(hdr->vq_index); virtnet_fill_stats(vi, qid, ctx, p, false, hdr->type); } return 0; } static void virtnet_make_stat_req(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, struct virtio_net_ctrl_queue_stats *req, int qid, int *idx) { int qtype = vq_type(vi, qid); u64 bitmap = ctx->bitmap[qtype]; if (!bitmap) return; req->stats[*idx].vq_index = cpu_to_le16(qid); req->stats[*idx].types_bitmap[0] = cpu_to_le64(bitmap); *idx += 1; } /* qid: -1: get stats of all vq. * > 0: get the stats for the special vq. This must not be cvq. */ static int virtnet_get_hw_stats(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, int qid) { int qnum, i, j, res_size, qtype, last_vq, first_vq; struct virtio_net_ctrl_queue_stats *req; bool enable_cvq; void *reply; int ok; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_DEVICE_STATS)) return 0; if (qid == -1) { last_vq = vi->curr_queue_pairs * 2 - 1; first_vq = 0; enable_cvq = true; } else { last_vq = qid; first_vq = qid; enable_cvq = false; } qnum = 0; res_size = 0; for (i = first_vq; i <= last_vq ; ++i) { qtype = vq_type(vi, i); if (ctx->bitmap[qtype]) { ++qnum; res_size += ctx->size[qtype]; } } if (enable_cvq && ctx->bitmap[VIRTNET_Q_TYPE_CQ]) { res_size += ctx->size[VIRTNET_Q_TYPE_CQ]; qnum += 1; } req = kcalloc(qnum, sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; reply = kmalloc(res_size, GFP_KERNEL); if (!reply) { kfree(req); return -ENOMEM; } j = 0; for (i = first_vq; i <= last_vq ; ++i) virtnet_make_stat_req(vi, ctx, req, i, &j); if (enable_cvq) virtnet_make_stat_req(vi, ctx, req, vi->max_queue_pairs * 2, &j); ok = __virtnet_get_hw_stats(vi, ctx, req, sizeof(*req) * j, reply, res_size); kfree(req); kfree(reply); return ok; } static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) { struct virtnet_info *vi = netdev_priv(dev); unsigned int i; u8 *p = data; switch (stringset) { case ETH_SS_STATS: /* Generate the total field names. */ virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_RX, -1, &p); virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_TX, -1, &p); virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_CQ, 0, &p); for (i = 0; i < vi->curr_queue_pairs; ++i) virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_RX, i, &p); for (i = 0; i < vi->curr_queue_pairs; ++i) virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_TX, i, &p); break; } } static int virtnet_get_sset_count(struct net_device *dev, int sset) { struct virtnet_info *vi = netdev_priv(dev); struct virtnet_stats_ctx ctx = {0}; u32 pair_count; switch (sset) { case ETH_SS_STATS: virtnet_stats_ctx_init(vi, &ctx, NULL, false); pair_count = ctx.desc_num[VIRTNET_Q_TYPE_RX] + ctx.desc_num[VIRTNET_Q_TYPE_TX]; return pair_count + ctx.desc_num[VIRTNET_Q_TYPE_CQ] + vi->curr_queue_pairs * pair_count; default: return -EOPNOTSUPP; } } static void virtnet_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct virtnet_info *vi = netdev_priv(dev); struct virtnet_stats_ctx ctx = {0}; unsigned int start, i; const u8 *stats_base; virtnet_stats_ctx_init(vi, &ctx, data, false); if (virtnet_get_hw_stats(vi, &ctx, -1)) dev_warn(&vi->dev->dev, "Failed to get hw stats.\n"); for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; struct send_queue *sq = &vi->sq[i]; stats_base = (const u8 *)&rq->stats; do { start = u64_stats_fetch_begin(&rq->stats.syncp); virtnet_fill_stats(vi, i * 2, &ctx, stats_base, true, 0); } while (u64_stats_fetch_retry(&rq->stats.syncp, start)); stats_base = (const u8 *)&sq->stats; do { start = u64_stats_fetch_begin(&sq->stats.syncp); virtnet_fill_stats(vi, i * 2 + 1, &ctx, stats_base, true, 0); } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); } virtnet_fill_total_fields(vi, &ctx); } static void virtnet_get_channels(struct net_device *dev, struct ethtool_channels *channels) { struct virtnet_info *vi = netdev_priv(dev); channels->combined_count = vi->curr_queue_pairs; channels->max_combined = vi->max_queue_pairs; channels->max_other = 0; channels->rx_count = 0; channels->tx_count = 0; channels->other_count = 0; } static int virtnet_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct virtnet_info *vi = netdev_priv(dev); return ethtool_virtdev_set_link_ksettings(dev, cmd, &vi->speed, &vi->duplex); } static int virtnet_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct virtnet_info *vi = netdev_priv(dev); cmd->base.speed = vi->speed; cmd->base.duplex = vi->duplex; cmd->base.port = PORT_OTHER; return 0; } static int virtnet_send_tx_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec) { struct virtio_net_ctrl_coal_tx *coal_tx __free(kfree) = NULL; struct scatterlist sgs_tx; int i; coal_tx = kzalloc(sizeof(*coal_tx), GFP_KERNEL); if (!coal_tx) return -ENOMEM; coal_tx->tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs); coal_tx->tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames); sg_init_one(&sgs_tx, coal_tx, sizeof(*coal_tx)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_TX_SET, &sgs_tx)) return -EINVAL; vi->intr_coal_tx.max_usecs = ec->tx_coalesce_usecs; vi->intr_coal_tx.max_packets = ec->tx_max_coalesced_frames; for (i = 0; i < vi->max_queue_pairs; i++) { vi->sq[i].intr_coal.max_usecs = ec->tx_coalesce_usecs; vi->sq[i].intr_coal.max_packets = ec->tx_max_coalesced_frames; } return 0; } static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec) { struct virtio_net_ctrl_coal_rx *coal_rx __free(kfree) = NULL; bool rx_ctrl_dim_on = !!ec->use_adaptive_rx_coalesce; struct scatterlist sgs_rx; int i; if (rx_ctrl_dim_on && !virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return -EOPNOTSUPP; if (rx_ctrl_dim_on && (ec->rx_coalesce_usecs != vi->intr_coal_rx.max_usecs || ec->rx_max_coalesced_frames != vi->intr_coal_rx.max_packets)) return -EINVAL; if (rx_ctrl_dim_on && !vi->rx_dim_enabled) { vi->rx_dim_enabled = true; for (i = 0; i < vi->max_queue_pairs; i++) { mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].dim_enabled = true; mutex_unlock(&vi->rq[i].dim_lock); } return 0; } coal_rx = kzalloc(sizeof(*coal_rx), GFP_KERNEL); if (!coal_rx) return -ENOMEM; if (!rx_ctrl_dim_on && vi->rx_dim_enabled) { vi->rx_dim_enabled = false; for (i = 0; i < vi->max_queue_pairs; i++) { mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].dim_enabled = false; mutex_unlock(&vi->rq[i].dim_lock); } } /* Since the per-queue coalescing params can be set, * we need apply the global new params even if they * are not updated. */ coal_rx->rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs); coal_rx->rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames); sg_init_one(&sgs_rx, coal_rx, sizeof(*coal_rx)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_RX_SET, &sgs_rx)) return -EINVAL; vi->intr_coal_rx.max_usecs = ec->rx_coalesce_usecs; vi->intr_coal_rx.max_packets = ec->rx_max_coalesced_frames; for (i = 0; i < vi->max_queue_pairs; i++) { mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].intr_coal.max_usecs = ec->rx_coalesce_usecs; vi->rq[i].intr_coal.max_packets = ec->rx_max_coalesced_frames; mutex_unlock(&vi->rq[i].dim_lock); } return 0; } static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec) { int err; err = virtnet_send_tx_notf_coal_cmds(vi, ec); if (err) return err; err = virtnet_send_rx_notf_coal_cmds(vi, ec); if (err) return err; return 0; } static int virtnet_send_rx_notf_coal_vq_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec, u16 queue) { bool rx_ctrl_dim_on = !!ec->use_adaptive_rx_coalesce; u32 max_usecs, max_packets; bool cur_rx_dim; int err; mutex_lock(&vi->rq[queue].dim_lock); cur_rx_dim = vi->rq[queue].dim_enabled; max_usecs = vi->rq[queue].intr_coal.max_usecs; max_packets = vi->rq[queue].intr_coal.max_packets; if (rx_ctrl_dim_on && (ec->rx_coalesce_usecs != max_usecs || ec->rx_max_coalesced_frames != max_packets)) { mutex_unlock(&vi->rq[queue].dim_lock); return -EINVAL; } if (rx_ctrl_dim_on && !cur_rx_dim) { vi->rq[queue].dim_enabled = true; mutex_unlock(&vi->rq[queue].dim_lock); return 0; } if (!rx_ctrl_dim_on && cur_rx_dim) vi->rq[queue].dim_enabled = false; /* If no params are updated, userspace ethtool will * reject the modification. */ err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, queue, ec->rx_coalesce_usecs, ec->rx_max_coalesced_frames); mutex_unlock(&vi->rq[queue].dim_lock); return err; } static int virtnet_send_notf_coal_vq_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec, u16 queue) { int err; err = virtnet_send_rx_notf_coal_vq_cmds(vi, ec, queue); if (err) return err; err = virtnet_send_tx_ctrl_coal_vq_cmd(vi, queue, ec->tx_coalesce_usecs, ec->tx_max_coalesced_frames); if (err) return err; return 0; } static void virtnet_rx_dim_work(struct work_struct *work) { struct dim *dim = container_of(work, struct dim, work); struct receive_queue *rq = container_of(dim, struct receive_queue, dim); struct virtnet_info *vi = rq->vq->vdev->priv; struct net_device *dev = vi->dev; struct dim_cq_moder update_moder; int qnum, err; qnum = rq - vi->rq; mutex_lock(&rq->dim_lock); if (!rq->dim_enabled) goto out; update_moder = net_dim_get_rx_irq_moder(dev, dim); if (update_moder.usec != rq->intr_coal.max_usecs || update_moder.pkts != rq->intr_coal.max_packets) { err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum, update_moder.usec, update_moder.pkts); if (err) pr_debug("%s: Failed to send dim parameters on rxq%d\n", dev->name, qnum); } out: dim->state = DIM_START_MEASURE; mutex_unlock(&rq->dim_lock); } static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) { /* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL * or VIRTIO_NET_F_VQ_NOTF_COAL feature is negotiated. */ if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs) return -EOPNOTSUPP; if (ec->tx_max_coalesced_frames > 1 || ec->rx_max_coalesced_frames != 1) return -EINVAL; return 0; } static int virtnet_should_update_vq_weight(int dev_flags, int weight, int vq_weight, bool *should_update) { if (weight ^ vq_weight) { if (dev_flags & IFF_UP) return -EBUSY; *should_update = true; } return 0; } static int virtnet_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); int ret, queue_number, napi_weight, i; bool update_napi = false; /* Can't change NAPI weight if the link is up */ napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) { ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, vi->sq[queue_number].napi.weight, &update_napi); if (ret) return ret; if (update_napi) { /* All queues that belong to [queue_number, vi->max_queue_pairs] will be * updated for the sake of simplicity, which might not be necessary */ break; } } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) ret = virtnet_send_notf_coal_cmds(vi, ec); else ret = virtnet_coal_params_supported(ec); if (ret) return ret; if (update_napi) { /* xsk xmit depends on the tx napi. So if xsk is active, * prevent modifications to tx napi. */ for (i = queue_number; i < vi->max_queue_pairs; i++) { if (vi->sq[i].xsk_pool) return -EBUSY; } for (; queue_number < vi->max_queue_pairs; queue_number++) vi->sq[queue_number].napi.weight = napi_weight; } return ret; } static int virtnet_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { ec->rx_coalesce_usecs = vi->intr_coal_rx.max_usecs; ec->tx_coalesce_usecs = vi->intr_coal_tx.max_usecs; ec->tx_max_coalesced_frames = vi->intr_coal_tx.max_packets; ec->rx_max_coalesced_frames = vi->intr_coal_rx.max_packets; ec->use_adaptive_rx_coalesce = vi->rx_dim_enabled; } else { ec->rx_max_coalesced_frames = 1; if (vi->sq[0].napi.weight) ec->tx_max_coalesced_frames = 1; } return 0; } static int virtnet_set_per_queue_coalesce(struct net_device *dev, u32 queue, struct ethtool_coalesce *ec) { struct virtnet_info *vi = netdev_priv(dev); int ret, napi_weight; bool update_napi = false; if (queue >= vi->max_queue_pairs) return -EINVAL; /* Can't change NAPI weight if the link is up */ napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, vi->sq[queue].napi.weight, &update_napi); if (ret) return ret; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); else ret = virtnet_coal_params_supported(ec); if (ret) return ret; if (update_napi) vi->sq[queue].napi.weight = napi_weight; return 0; } static int virtnet_get_per_queue_coalesce(struct net_device *dev, u32 queue, struct ethtool_coalesce *ec) { struct virtnet_info *vi = netdev_priv(dev); if (queue >= vi->max_queue_pairs) return -EINVAL; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) { mutex_lock(&vi->rq[queue].dim_lock); ec->rx_coalesce_usecs = vi->rq[queue].intr_coal.max_usecs; ec->tx_coalesce_usecs = vi->sq[queue].intr_coal.max_usecs; ec->tx_max_coalesced_frames = vi->sq[queue].intr_coal.max_packets; ec->rx_max_coalesced_frames = vi->rq[queue].intr_coal.max_packets; ec->use_adaptive_rx_coalesce = vi->rq[queue].dim_enabled; mutex_unlock(&vi->rq[queue].dim_lock); } else { ec->rx_max_coalesced_frames = 1; if (vi->sq[queue].napi.weight) ec->tx_max_coalesced_frames = 1; } return 0; } static void virtnet_init_settings(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); vi->speed = SPEED_UNKNOWN; vi->duplex = DUPLEX_UNKNOWN; } static u32 virtnet_get_rxfh_key_size(struct net_device *dev) { return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size; } static u32 virtnet_get_rxfh_indir_size(struct net_device *dev) { return ((struct virtnet_info *)netdev_priv(dev))->rss_indir_table_size; } static int virtnet_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh) { struct virtnet_info *vi = netdev_priv(dev); int i; if (rxfh->indir) { for (i = 0; i < vi->rss_indir_table_size; ++i) rxfh->indir[i] = le16_to_cpu(vi->rss_hdr->indirection_table[i]); } if (rxfh->key) memcpy(rxfh->key, vi->rss_hash_key_data, vi->rss_key_size); rxfh->hfunc = ETH_RSS_HASH_TOP; return 0; } static int virtnet_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); bool update = false; int i; if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && rxfh->hfunc != ETH_RSS_HASH_TOP) return -EOPNOTSUPP; if (rxfh->indir) { if (!vi->has_rss) return -EOPNOTSUPP; for (i = 0; i < vi->rss_indir_table_size; ++i) vi->rss_hdr->indirection_table[i] = cpu_to_le16(rxfh->indir[i]); update = true; } if (rxfh->key) { /* If either _F_HASH_REPORT or _F_RSS are negotiated, the * device provides hash calculation capabilities, that is, * hash_key is configured. */ if (!vi->has_rss && !vi->has_rss_hash_report) return -EOPNOTSUPP; memcpy(vi->rss_hash_key_data, rxfh->key, vi->rss_key_size); update = true; } if (update) virtnet_commit_rss_command(vi); return 0; } static int virtnet_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs) { struct virtnet_info *vi = netdev_priv(dev); int rc = 0; switch (info->cmd) { case ETHTOOL_GRXRINGS: info->data = vi->curr_queue_pairs; break; case ETHTOOL_GRXFH: virtnet_get_hashflow(vi, info); break; default: rc = -EOPNOTSUPP; } return rc; } static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info) { struct virtnet_info *vi = netdev_priv(dev); int rc = 0; switch (info->cmd) { case ETHTOOL_SRXFH: if (!virtnet_set_hashflow(vi, info)) rc = -EINVAL; break; default: rc = -EOPNOTSUPP; } return rc; } static const struct ethtool_ops virtnet_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES | ETHTOOL_COALESCE_USECS | ETHTOOL_COALESCE_USE_ADAPTIVE_RX, .get_drvinfo = virtnet_get_drvinfo, .get_link = ethtool_op_get_link, .get_ringparam = virtnet_get_ringparam, .set_ringparam = virtnet_set_ringparam, .get_strings = virtnet_get_strings, .get_sset_count = virtnet_get_sset_count, .get_ethtool_stats = virtnet_get_ethtool_stats, .set_channels = virtnet_set_channels, .get_channels = virtnet_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_link_ksettings = virtnet_get_link_ksettings, .set_link_ksettings = virtnet_set_link_ksettings, .set_coalesce = virtnet_set_coalesce, .get_coalesce = virtnet_get_coalesce, .set_per_queue_coalesce = virtnet_set_per_queue_coalesce, .get_per_queue_coalesce = virtnet_get_per_queue_coalesce, .get_rxfh_key_size = virtnet_get_rxfh_key_size, .get_rxfh_indir_size = virtnet_get_rxfh_indir_size, .get_rxfh = virtnet_get_rxfh, .set_rxfh = virtnet_set_rxfh, .get_rxnfc = virtnet_get_rxnfc, .set_rxnfc = virtnet_set_rxnfc, }; static void virtnet_get_queue_stats_rx(struct net_device *dev, int i, struct netdev_queue_stats_rx *stats) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq = &vi->rq[i]; struct virtnet_stats_ctx ctx = {0}; virtnet_stats_ctx_init(vi, &ctx, (void *)stats, true); virtnet_get_hw_stats(vi, &ctx, i * 2); virtnet_fill_stats(vi, i * 2, &ctx, (void *)&rq->stats, true, 0); } static void virtnet_get_queue_stats_tx(struct net_device *dev, int i, struct netdev_queue_stats_tx *stats) { struct virtnet_info *vi = netdev_priv(dev); struct send_queue *sq = &vi->sq[i]; struct virtnet_stats_ctx ctx = {0}; virtnet_stats_ctx_init(vi, &ctx, (void *)stats, true); virtnet_get_hw_stats(vi, &ctx, i * 2 + 1); virtnet_fill_stats(vi, i * 2 + 1, &ctx, (void *)&sq->stats, true, 0); } static void virtnet_get_base_stats(struct net_device *dev, struct netdev_queue_stats_rx *rx, struct netdev_queue_stats_tx *tx) { struct virtnet_info *vi = netdev_priv(dev); /* The queue stats of the virtio-net will not be reset. So here we * return 0. */ rx->bytes = 0; rx->packets = 0; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { rx->hw_drops = 0; rx->hw_drop_overruns = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { rx->csum_unnecessary = 0; rx->csum_none = 0; rx->csum_bad = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_GSO) { rx->hw_gro_packets = 0; rx->hw_gro_bytes = 0; rx->hw_gro_wire_packets = 0; rx->hw_gro_wire_bytes = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) rx->hw_drop_ratelimits = 0; tx->bytes = 0; tx->packets = 0; tx->stop = 0; tx->wake = 0; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { tx->hw_drops = 0; tx->hw_drop_errors = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { tx->csum_none = 0; tx->needs_csum = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { tx->hw_gso_packets = 0; tx->hw_gso_bytes = 0; tx->hw_gso_wire_packets = 0; tx->hw_gso_wire_bytes = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) tx->hw_drop_ratelimits = 0; } static const struct netdev_stat_ops virtnet_stat_ops = { .get_queue_stats_rx = virtnet_get_queue_stats_rx, .get_queue_stats_tx = virtnet_get_queue_stats_tx, .get_base_stats = virtnet_get_base_stats, }; static void virtnet_freeze_down(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; /* Make sure no work handler is accessing the device */ flush_work(&vi->config_work); disable_rx_mode_work(vi); flush_work(&vi->rx_mode_work); netif_tx_lock_bh(vi->dev); netif_device_detach(vi->dev); netif_tx_unlock_bh(vi->dev); if (netif_running(vi->dev)) { rtnl_lock(); virtnet_close(vi->dev); rtnl_unlock(); } } static int init_vqs(struct virtnet_info *vi); static int virtnet_restore_up(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; int err; err = init_vqs(vi); if (err) return err; virtio_device_ready(vdev); enable_delayed_refill(vi); enable_rx_mode_work(vi); if (netif_running(vi->dev)) { rtnl_lock(); err = virtnet_open(vi->dev); rtnl_unlock(); if (err) return err; } netif_tx_lock_bh(vi->dev); netif_device_attach(vi->dev); netif_tx_unlock_bh(vi->dev); return err; } static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads) { __virtio64 *_offloads __free(kfree) = NULL; struct scatterlist sg; _offloads = kzalloc(sizeof(*_offloads), GFP_KERNEL); if (!_offloads) return -ENOMEM; *_offloads = cpu_to_virtio64(vi->vdev, offloads); sg_init_one(&sg, _offloads, sizeof(*_offloads)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS, VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) { dev_warn(&vi->dev->dev, "Fail to set guest offload.\n"); return -EINVAL; } return 0; } static int virtnet_clear_guest_offloads(struct virtnet_info *vi) { u64 offloads = 0; if (!vi->guest_offloads) return 0; return virtnet_set_guest_offloads(vi, offloads); } static int virtnet_restore_guest_offloads(struct virtnet_info *vi) { u64 offloads = vi->guest_offloads; if (!vi->guest_offloads) return 0; return virtnet_set_guest_offloads(vi, offloads); } static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, struct xsk_buff_pool *pool) { int err, qindex; qindex = rq - vi->rq; if (pool) { err = xdp_rxq_info_reg(&rq->xsk_rxq_info, vi->dev, qindex, rq->napi.napi_id); if (err < 0) return err; err = xdp_rxq_info_reg_mem_model(&rq->xsk_rxq_info, MEM_TYPE_XSK_BUFF_POOL, NULL); if (err < 0) goto unreg; xsk_pool_set_rxq_info(pool, &rq->xsk_rxq_info); } virtnet_rx_pause(vi, rq); err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf, NULL); if (err) { netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); pool = NULL; } rq->xsk_pool = pool; virtnet_rx_resume(vi, rq); if (pool) return 0; unreg: xdp_rxq_info_unreg(&rq->xsk_rxq_info); return err; } static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, struct send_queue *sq, struct xsk_buff_pool *pool) { int err, qindex; qindex = sq - vi->sq; virtnet_tx_pause(vi, sq); err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf, virtnet_sq_free_unused_buf_done); if (err) { netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); pool = NULL; } sq->xsk_pool = pool; virtnet_tx_resume(vi, sq); return err; } static int virtnet_xsk_pool_enable(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq; struct device *dma_dev; struct send_queue *sq; dma_addr_t hdr_dma; int err, size; if (vi->hdr_len > xsk_pool_get_headroom(pool)) return -EINVAL; /* In big_packets mode, xdp cannot work, so there is no need to * initialize xsk of rq. */ if (vi->big_packets && !vi->mergeable_rx_bufs) return -ENOENT; if (qid >= vi->curr_queue_pairs) return -EINVAL; sq = &vi->sq[qid]; rq = &vi->rq[qid]; /* xsk assumes that tx and rx must have the same dma device. The af-xdp * may use one buffer to receive from the rx and reuse this buffer to * send by the tx. So the dma dev of sq and rq must be the same one. * * But vq->dma_dev allows every vq has the respective dma dev. So I * check the dma dev of vq and sq is the same dev. */ if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) return -EINVAL; dma_dev = virtqueue_dma_dev(rq->vq); if (!dma_dev) return -EINVAL; size = virtqueue_get_vring_size(rq->vq); rq->xsk_buffs = kvcalloc(size, sizeof(*rq->xsk_buffs), GFP_KERNEL); if (!rq->xsk_buffs) return -ENOMEM; hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE, 0); if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) return -ENOMEM; err = xsk_pool_dma_map(pool, dma_dev, 0); if (err) goto err_xsk_map; err = virtnet_rq_bind_xsk_pool(vi, rq, pool); if (err) goto err_rq; err = virtnet_sq_bind_xsk_pool(vi, sq, pool); if (err) goto err_sq; /* Now, we do not support tx offload(such as tx csum), so all the tx * virtnet hdr is zero. So all the tx packets can share a single hdr. */ sq->xsk_hdr_dma_addr = hdr_dma; return 0; err_sq: virtnet_rq_bind_xsk_pool(vi, rq, NULL); err_rq: xsk_pool_dma_unmap(pool, 0); err_xsk_map: virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, DMA_TO_DEVICE, 0); return err; } static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) { struct virtnet_info *vi = netdev_priv(dev); struct xsk_buff_pool *pool; struct receive_queue *rq; struct send_queue *sq; int err; if (qid >= vi->curr_queue_pairs) return -EINVAL; sq = &vi->sq[qid]; rq = &vi->rq[qid]; pool = rq->xsk_pool; err = virtnet_rq_bind_xsk_pool(vi, rq, NULL); err |= virtnet_sq_bind_xsk_pool(vi, sq, NULL); xsk_pool_dma_unmap(pool, 0); virtqueue_dma_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, vi->hdr_len, DMA_TO_DEVICE, 0); kvfree(rq->xsk_buffs); return err; } static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp) { if (xdp->xsk.pool) return virtnet_xsk_pool_enable(dev, xdp->xsk.pool, xdp->xsk.queue_id); else return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id); } static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { unsigned int room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + sizeof(struct skb_shared_info)); unsigned int max_sz = PAGE_SIZE - room - ETH_HLEN; struct virtnet_info *vi = netdev_priv(dev); struct bpf_prog *old_prog; u16 xdp_qp = 0, curr_qp; int i, err; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6))) { NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first"); return -EOPNOTSUPP; } if (vi->mergeable_rx_bufs && !vi->any_header_sg) { NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required"); return -EINVAL; } if (prog && !prog->aux->xdp_has_frags && dev->mtu > max_sz) { NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP without frags"); netdev_warn(dev, "single-buffer XDP requires MTU less than %u\n", max_sz); return -EINVAL; } curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs; if (prog) xdp_qp = nr_cpu_ids; /* XDP requires extra queues for XDP_TX */ if (curr_qp + xdp_qp > vi->max_queue_pairs) { netdev_warn_once(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n", curr_qp + xdp_qp, vi->max_queue_pairs); xdp_qp = 0; } old_prog = rtnl_dereference(vi->rq[0].xdp_prog); if (!prog && !old_prog) return 0; if (prog) bpf_prog_add(prog, vi->max_queue_pairs - 1); /* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { for (i = 0; i < vi->max_queue_pairs; i++) { virtnet_napi_disable(&vi->rq[i]); virtnet_napi_tx_disable(&vi->sq[i]); } } if (!prog) { for (i = 0; i < vi->max_queue_pairs; i++) { rcu_assign_pointer(vi->rq[i].xdp_prog, prog); if (i == 0) virtnet_restore_guest_offloads(vi); } synchronize_net(); } err = virtnet_set_queues(vi, curr_qp + xdp_qp); if (err) goto err; netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp); vi->xdp_queue_pairs = xdp_qp; if (prog) { vi->xdp_enabled = true; for (i = 0; i < vi->max_queue_pairs; i++) { rcu_assign_pointer(vi->rq[i].xdp_prog, prog); if (i == 0 && !old_prog) virtnet_clear_guest_offloads(vi); } if (!old_prog) xdp_features_set_redirect_target(dev, true); } else { xdp_features_clear_redirect_target(dev); vi->xdp_enabled = false; } for (i = 0; i < vi->max_queue_pairs; i++) { if (old_prog) bpf_prog_put(old_prog); if (netif_running(dev)) { virtnet_napi_enable(&vi->rq[i]); virtnet_napi_tx_enable(&vi->sq[i]); } } return 0; err: if (!prog) { virtnet_clear_guest_offloads(vi); for (i = 0; i < vi->max_queue_pairs; i++) rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog); } if (netif_running(dev)) { for (i = 0; i < vi->max_queue_pairs; i++) { virtnet_napi_enable(&vi->rq[i]); virtnet_napi_tx_enable(&vi->sq[i]); } } if (prog) bpf_prog_sub(prog, vi->max_queue_pairs - 1); return err; } static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return virtnet_xdp_set(dev, xdp->prog, xdp->extack); case XDP_SETUP_XSK_POOL: return virtnet_xsk_pool_setup(dev, xdp); default: return -EINVAL; } } static int virtnet_get_phys_port_name(struct net_device *dev, char *buf, size_t len) { struct virtnet_info *vi = netdev_priv(dev); int ret; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY)) return -EOPNOTSUPP; ret = snprintf(buf, len, "sby"); if (ret >= len) return -EOPNOTSUPP; return 0; } static int virtnet_set_features(struct net_device *dev, netdev_features_t features) { struct virtnet_info *vi = netdev_priv(dev); u64 offloads; int err; if ((dev->features ^ features) & NETIF_F_GRO_HW) { if (vi->xdp_enabled) return -EBUSY; if (features & NETIF_F_GRO_HW) offloads = vi->guest_offloads_capable; else offloads = vi->guest_offloads_capable & ~GUEST_OFFLOAD_GRO_HW_MASK; err = virtnet_set_guest_offloads(vi, offloads); if (err) return err; vi->guest_offloads = offloads; } if ((dev->features ^ features) & NETIF_F_RXHASH) { if (features & NETIF_F_RXHASH) vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_saved); else vi->rss_hdr->hash_types = cpu_to_le32(VIRTIO_NET_HASH_REPORT_NONE); if (!virtnet_commit_rss_command(vi)) return -EINVAL; } return 0; } static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct virtnet_info *priv = netdev_priv(dev); struct send_queue *sq = &priv->sq[txqueue]; struct netdev_queue *txq = netdev_get_tx_queue(dev, txqueue); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.tx_timeouts); u64_stats_update_end(&sq->stats.syncp); netdev_err(dev, "TX timeout on queue: %u, sq: %s, vq: 0x%x, name: %s, %u usecs ago\n", txqueue, sq->name, sq->vq->index, sq->vq->name, jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start))); } static int virtnet_init_irq_moder(struct virtnet_info *vi) { u8 profile_flags = 0, coal_flags = 0; int ret, i; profile_flags |= DIM_PROFILE_RX; coal_flags |= DIM_COALESCE_USEC | DIM_COALESCE_PKTS; ret = net_dim_init_irq_moder(vi->dev, profile_flags, coal_flags, DIM_CQ_PERIOD_MODE_START_FROM_EQE, 0, virtnet_rx_dim_work, NULL); if (ret) return ret; for (i = 0; i < vi->max_queue_pairs; i++) net_dim_setting(vi->dev, &vi->rq[i].dim, false); return 0; } static void virtnet_free_irq_moder(struct virtnet_info *vi) { if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return; rtnl_lock(); net_dim_free_irq_moder(vi->dev); rtnl_unlock(); } static const struct net_device_ops virtnet_netdev = { .ndo_open = virtnet_open, .ndo_stop = virtnet_close, .ndo_start_xmit = start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = virtnet_set_mac_address, .ndo_set_rx_mode = virtnet_set_rx_mode, .ndo_get_stats64 = virtnet_stats, .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid, .ndo_bpf = virtnet_xdp, .ndo_xdp_xmit = virtnet_xdp_xmit, .ndo_xsk_wakeup = virtnet_xsk_wakeup, .ndo_features_check = passthru_features_check, .ndo_get_phys_port_name = virtnet_get_phys_port_name, .ndo_set_features = virtnet_set_features, .ndo_tx_timeout = virtnet_tx_timeout, }; static void virtnet_config_changed_work(struct work_struct *work) { struct virtnet_info *vi = container_of(work, struct virtnet_info, config_work); u16 v; if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS, struct virtio_net_config, status, &v) < 0) return; if (v & VIRTIO_NET_S_ANNOUNCE) { netdev_notify_peers(vi->dev); virtnet_ack_link_announce(vi); } /* Ignore unknown (future) status bits */ v &= VIRTIO_NET_S_LINK_UP; if (vi->status == v) return; vi->status = v; if (vi->status & VIRTIO_NET_S_LINK_UP) { virtnet_update_settings(vi); netif_carrier_on(vi->dev); netif_tx_wake_all_queues(vi->dev); } else { netif_carrier_off(vi->dev); netif_tx_stop_all_queues(vi->dev); } } static void virtnet_config_changed(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; schedule_work(&vi->config_work); } static void virtnet_free_queues(struct virtnet_info *vi) { int i; for (i = 0; i < vi->max_queue_pairs; i++) { __netif_napi_del(&vi->rq[i].napi); __netif_napi_del(&vi->sq[i].napi); } /* We called __netif_napi_del(), * we need to respect an RCU grace period before freeing vi->rq */ synchronize_net(); kfree(vi->rq); kfree(vi->sq); kfree(vi->ctrl); } static void _free_receive_bufs(struct virtnet_info *vi) { struct bpf_prog *old_prog; int i; for (i = 0; i < vi->max_queue_pairs; i++) { while (vi->rq[i].pages) __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0); old_prog = rtnl_dereference(vi->rq[i].xdp_prog); RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL); if (old_prog) bpf_prog_put(old_prog); } } static void free_receive_bufs(struct virtnet_info *vi) { rtnl_lock(); _free_receive_bufs(vi); rtnl_unlock(); } static void free_receive_page_frags(struct virtnet_info *vi) { int i; for (i = 0; i < vi->max_queue_pairs; i++) if (vi->rq[i].alloc_frag.page) { if (vi->rq[i].last_dma) virtnet_rq_unmap(&vi->rq[i], vi->rq[i].last_dma, 0); put_page(vi->rq[i].alloc_frag.page); } } static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) { struct virtnet_info *vi = vq->vdev->priv; struct send_queue *sq; int i = vq2txq(vq); sq = &vi->sq[i]; switch (virtnet_xmit_ptr_unpack(&buf)) { case VIRTNET_XMIT_TYPE_SKB: case VIRTNET_XMIT_TYPE_SKB_ORPHAN: dev_kfree_skb(buf); break; case VIRTNET_XMIT_TYPE_XDP: xdp_return_frame(buf); break; case VIRTNET_XMIT_TYPE_XSK: xsk_tx_completed(sq->xsk_pool, 1); break; } } static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq) { struct virtnet_info *vi = vq->vdev->priv; int i = vq2txq(vq); netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); } static void free_unused_bufs(struct virtnet_info *vi) { void *buf; int i; for (i = 0; i < vi->max_queue_pairs; i++) { struct virtqueue *vq = vi->sq[i].vq; while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) virtnet_sq_free_unused_buf(vq, buf); cond_resched(); } for (i = 0; i < vi->max_queue_pairs; i++) { struct virtqueue *vq = vi->rq[i].vq; while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) virtnet_rq_unmap_free_buf(vq, buf); cond_resched(); } } static void virtnet_del_vqs(struct virtnet_info *vi) { struct virtio_device *vdev = vi->vdev; virtnet_clean_affinity(vi); vdev->config->del_vqs(vdev); virtnet_free_queues(vi); } /* How large should a single buffer be so a queue full of these can fit at * least one full packet? * Logic below assumes the mergeable buffer header is used. */ static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq) { const unsigned int hdr_len = vi->hdr_len; unsigned int rq_size = virtqueue_get_vring_size(vq); unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu; unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len; unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size); return max(max(min_buf_len, hdr_len) - hdr_len, (unsigned int)GOOD_PACKET_LEN); } static int virtnet_find_vqs(struct virtnet_info *vi) { struct virtqueue_info *vqs_info; struct virtqueue **vqs; int ret = -ENOMEM; int total_vqs; bool *ctx; u16 i; /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by * possible control vq. */ total_vqs = vi->max_queue_pairs * 2 + virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ); /* Allocate space for find_vqs parameters */ vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL); if (!vqs) goto err_vq; vqs_info = kcalloc(total_vqs, sizeof(*vqs_info), GFP_KERNEL); if (!vqs_info) goto err_vqs_info; if (!vi->big_packets || vi->mergeable_rx_bufs) { ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL); if (!ctx) goto err_ctx; } else { ctx = NULL; } /* Parameters for control virtqueue, if any */ if (vi->has_cvq) { vqs_info[total_vqs - 1].name = "control"; } /* Allocate/initialize parameters for send/receive virtqueues */ for (i = 0; i < vi->max_queue_pairs; i++) { vqs_info[rxq2vq(i)].callback = skb_recv_done; vqs_info[txq2vq(i)].callback = skb_xmit_done; sprintf(vi->rq[i].name, "input.%u", i); sprintf(vi->sq[i].name, "output.%u", i); vqs_info[rxq2vq(i)].name = vi->rq[i].name; vqs_info[txq2vq(i)].name = vi->sq[i].name; if (ctx) vqs_info[rxq2vq(i)].ctx = true; } ret = virtio_find_vqs(vi->vdev, total_vqs, vqs, vqs_info, NULL); if (ret) goto err_find; if (vi->has_cvq) { vi->cvq = vqs[total_vqs - 1]; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } for (i = 0; i < vi->max_queue_pairs; i++) { vi->rq[i].vq = vqs[rxq2vq(i)]; vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); vi->sq[i].vq = vqs[txq2vq(i)]; } /* run here: ret == 0. */ err_find: kfree(ctx); err_ctx: kfree(vqs_info); err_vqs_info: kfree(vqs); err_vq: return ret; } static int virtnet_alloc_queues(struct virtnet_info *vi) { int i; if (vi->has_cvq) { vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL); if (!vi->ctrl) goto err_ctrl; } else { vi->ctrl = NULL; } vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL); if (!vi->sq) goto err_sq; vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL); if (!vi->rq) goto err_rq; INIT_DELAYED_WORK(&vi->refill, refill_work); for (i = 0; i < vi->max_queue_pairs; i++) { vi->rq[i].pages = NULL; netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll, i); vi->rq[i].napi.weight = napi_weight; netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, napi_tx ? napi_weight : 0); sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg)); u64_stats_init(&vi->rq[i].stats.syncp); u64_stats_init(&vi->sq[i].stats.syncp); mutex_init(&vi->rq[i].dim_lock); } return 0; err_rq: kfree(vi->sq); err_sq: kfree(vi->ctrl); err_ctrl: return -ENOMEM; } static int init_vqs(struct virtnet_info *vi) { int ret; /* Allocate send & receive queues */ ret = virtnet_alloc_queues(vi); if (ret) goto err; ret = virtnet_find_vqs(vi); if (ret) goto err_free; cpus_read_lock(); virtnet_set_affinity(vi); cpus_read_unlock(); return 0; err_free: virtnet_free_queues(vi); err: return ret; } #ifdef CONFIG_SYSFS static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue, char *buf) { struct virtnet_info *vi = netdev_priv(queue->dev); unsigned int queue_index = get_netdev_rx_queue_index(queue); unsigned int headroom = virtnet_get_headroom(vi); unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0; struct ewma_pkt_len *avg; BUG_ON(queue_index >= vi->max_queue_pairs); avg = &vi->rq[queue_index].mrg_avg_pkt_len; return sprintf(buf, "%u\n", get_mergeable_buf_len(&vi->rq[queue_index], avg, SKB_DATA_ALIGN(headroom + tailroom))); } static struct rx_queue_attribute mergeable_rx_buffer_size_attribute = __ATTR_RO(mergeable_rx_buffer_size); static struct attribute *virtio_net_mrg_rx_attrs[] = { &mergeable_rx_buffer_size_attribute.attr, NULL }; static const struct attribute_group virtio_net_mrg_rx_group = { .name = "virtio_net", .attrs = virtio_net_mrg_rx_attrs }; #endif static bool virtnet_fail_on_feature(struct virtio_device *vdev, unsigned int fbit, const char *fname, const char *dname) { if (!virtio_has_feature(vdev, fbit)) return false; dev_err(&vdev->dev, "device advertises feature %s but not %s", fname, dname); return true; } #define VIRTNET_FAIL_ON(vdev, fbit, dbit) \ virtnet_fail_on_feature(vdev, fbit, #fbit, dbit) static bool virtnet_validate_features(struct virtio_device *vdev) { if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) && (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_NOTF_COAL, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_VQ_NOTF_COAL, "VIRTIO_NET_F_CTRL_VQ"))) { return false; } return true; } #define MIN_MTU ETH_MIN_MTU #define MAX_MTU ETH_MAX_MTU static int virtnet_validate(struct virtio_device *vdev) { if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); return -EINVAL; } if (!virtnet_validate_features(vdev)) return -EINVAL; if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) { int mtu = virtio_cread16(vdev, offsetof(struct virtio_net_config, mtu)); if (mtu < MIN_MTU) __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU); } if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY) && !virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { dev_warn(&vdev->dev, "device advertises feature VIRTIO_NET_F_STANDBY but not VIRTIO_NET_F_MAC, disabling standby"); __virtio_clear_bit(vdev, VIRTIO_NET_F_STANDBY); } return 0; } static bool virtnet_check_guest_gso(const struct virtnet_info *vi) { return virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) || (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) && virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6)); } static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu) { bool guest_gso = virtnet_check_guest_gso(vi); /* If device can receive ANY guest GSO packets, regardless of mtu, * allocate packets of maximum size, otherwise limit it to only * mtu size worth only. */ if (mtu > ETH_DATA_LEN || guest_gso) { vi->big_packets = true; vi->big_packets_num_skbfrags = guest_gso ? MAX_SKB_FRAGS : DIV_ROUND_UP(mtu, PAGE_SIZE); } } #define VIRTIO_NET_HASH_REPORT_MAX_TABLE 10 static enum xdp_rss_hash_type virtnet_xdp_rss_type[VIRTIO_NET_HASH_REPORT_MAX_TABLE] = { [VIRTIO_NET_HASH_REPORT_NONE] = XDP_RSS_TYPE_NONE, [VIRTIO_NET_HASH_REPORT_IPv4] = XDP_RSS_TYPE_L3_IPV4, [VIRTIO_NET_HASH_REPORT_TCPv4] = XDP_RSS_TYPE_L4_IPV4_TCP, [VIRTIO_NET_HASH_REPORT_UDPv4] = XDP_RSS_TYPE_L4_IPV4_UDP, [VIRTIO_NET_HASH_REPORT_IPv6] = XDP_RSS_TYPE_L3_IPV6, [VIRTIO_NET_HASH_REPORT_TCPv6] = XDP_RSS_TYPE_L4_IPV6_TCP, [VIRTIO_NET_HASH_REPORT_UDPv6] = XDP_RSS_TYPE_L4_IPV6_UDP, [VIRTIO_NET_HASH_REPORT_IPv6_EX] = XDP_RSS_TYPE_L3_IPV6_EX, [VIRTIO_NET_HASH_REPORT_TCPv6_EX] = XDP_RSS_TYPE_L4_IPV6_TCP_EX, [VIRTIO_NET_HASH_REPORT_UDPv6_EX] = XDP_RSS_TYPE_L4_IPV6_UDP_EX }; static int virtnet_xdp_rx_hash(const struct xdp_md *_ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { const struct xdp_buff *xdp = (void *)_ctx; struct virtio_net_hdr_v1_hash *hdr_hash; struct virtnet_info *vi; u16 hash_report; if (!(xdp->rxq->dev->features & NETIF_F_RXHASH)) return -ENODATA; vi = netdev_priv(xdp->rxq->dev); hdr_hash = (struct virtio_net_hdr_v1_hash *)(xdp->data - vi->hdr_len); hash_report = __le16_to_cpu(hdr_hash->hash_report); if (hash_report >= VIRTIO_NET_HASH_REPORT_MAX_TABLE) hash_report = VIRTIO_NET_HASH_REPORT_NONE; *rss_type = virtnet_xdp_rss_type[hash_report]; *hash = __le32_to_cpu(hdr_hash->hash_value); return 0; } static const struct xdp_metadata_ops virtnet_xdp_metadata_ops = { .xmo_rx_hash = virtnet_xdp_rx_hash, }; static int virtnet_probe(struct virtio_device *vdev) { int i, err = -ENOMEM; struct net_device *dev; struct virtnet_info *vi; u16 max_queue_pairs; int mtu = 0; /* Find if host supports multiqueue/rss virtio_net device */ max_queue_pairs = 1; if (virtio_has_feature(vdev, VIRTIO_NET_F_MQ) || virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) max_queue_pairs = virtio_cread16(vdev, offsetof(struct virtio_net_config, max_virtqueue_pairs)); /* We need at least 2 queue's */ if (max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX || !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) max_queue_pairs = 1; /* Allocate ourselves a network device with room for our info */ dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs); if (!dev) return -ENOMEM; /* Set up network device as normal. */ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE | IFF_TX_SKB_NO_LINEAR; dev->netdev_ops = &virtnet_netdev; dev->stat_ops = &virtnet_stat_ops; dev->features = NETIF_F_HIGHDMA; dev->ethtool_ops = &virtnet_ethtool_ops; SET_NETDEV_DEV(dev, &vdev->dev); /* Do we support "hardware" checksums? */ if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) { /* This opens up the world of extra features. */ dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG; if (csum) dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG; if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) { dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6; } /* Individual feature bits: what can host handle? */ if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4)) dev->hw_features |= NETIF_F_TSO; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6)) dev->hw_features |= NETIF_F_TSO6; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN)) dev->hw_features |= NETIF_F_TSO_ECN; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_USO)) dev->hw_features |= NETIF_F_GSO_UDP_L4; dev->features |= NETIF_F_GSO_ROBUST; if (gso) dev->features |= dev->hw_features & NETIF_F_ALL_TSO; /* (!csum && gso) case will be fixed by register_netdev() */ } /* 1. With VIRTIO_NET_F_GUEST_CSUM negotiation, the driver doesn't * need to calculate checksums for partially checksummed packets, * as they're considered valid by the upper layer. * 2. Without VIRTIO_NET_F_GUEST_CSUM negotiation, the driver only * receives fully checksummed packets. The device may assist in * validating these packets' checksums, so the driver won't have to. */ dev->features |= NETIF_F_RXCSUM; if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) dev->features |= NETIF_F_GRO_HW; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) dev->hw_features |= NETIF_F_GRO_HW; dev->vlan_features = dev->features; dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_XSK_ZEROCOPY; /* MTU range: 68 - 65535 */ dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU; /* Configuration may specify what MAC to use. Otherwise random. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { u8 addr[ETH_ALEN]; virtio_cread_bytes(vdev, offsetof(struct virtio_net_config, mac), addr, ETH_ALEN); eth_hw_addr_set(dev, addr); } else { eth_hw_addr_random(dev); dev_info(&vdev->dev, "Assigned random MAC address %pM\n", dev->dev_addr); } /* Set up our device-specific information */ vi = netdev_priv(dev); vi->dev = dev; vi->vdev = vdev; vdev->priv = vi; INIT_WORK(&vi->config_work, virtnet_config_changed_work); INIT_WORK(&vi->rx_mode_work, virtnet_rx_mode_work); spin_lock_init(&vi->refill_lock); if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { vi->mergeable_rx_bufs = true; dev->xdp_features |= NETDEV_XDP_ACT_RX_SG; } if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) vi->has_rss_hash_report = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) { vi->has_rss = true; vi->rss_indir_table_size = virtio_cread16(vdev, offsetof(struct virtio_net_config, rss_max_indirection_table_length)); } vi->rss_hdr = devm_kzalloc(&vdev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); if (!vi->rss_hdr) { err = -ENOMEM; goto free; } if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size)); if (vi->rss_key_size > VIRTIO_NET_RSS_MAX_KEY_SIZE) { dev_err(&vdev->dev, "rss_max_key_size=%u exceeds the limit %u.\n", vi->rss_key_size, VIRTIO_NET_RSS_MAX_KEY_SIZE); err = -EINVAL; goto free; } vi->rss_hash_types_supported = virtio_cread32(vdev, offsetof(struct virtio_net_config, supported_hash_types)); vi->rss_hash_types_supported &= ~(VIRTIO_NET_RSS_HASH_TYPE_IP_EX | VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | VIRTIO_NET_RSS_HASH_TYPE_UDP_EX); dev->hw_features |= NETIF_F_RXHASH; dev->xdp_metadata_ops = &virtnet_xdp_metadata_ops; } if (vi->has_rss_hash_report) vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash); else if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) || virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); else vi->hdr_len = sizeof(struct virtio_net_hdr); if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) || virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) vi->any_header_sg = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) vi->has_cvq = true; mutex_init(&vi->cvq_lock); if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) { mtu = virtio_cread16(vdev, offsetof(struct virtio_net_config, mtu)); if (mtu < dev->min_mtu) { /* Should never trigger: MTU was previously validated * in virtnet_validate. */ dev_err(&vdev->dev, "device MTU appears to have changed it is now %d < %d", mtu, dev->min_mtu); err = -EINVAL; goto free; } dev->mtu = mtu; dev->max_mtu = mtu; } virtnet_set_big_packets(vi, mtu); if (vi->any_header_sg) dev->needed_headroom = vi->hdr_len; /* Enable multiqueue by default */ if (num_online_cpus() >= max_queue_pairs) vi->curr_queue_pairs = max_queue_pairs; else vi->curr_queue_pairs = num_online_cpus(); vi->max_queue_pairs = max_queue_pairs; /* Allocate/initialize the rx/tx queues, and invoke find_vqs */ err = init_vqs(vi); if (err) goto free; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { vi->intr_coal_rx.max_usecs = 0; vi->intr_coal_tx.max_usecs = 0; vi->intr_coal_rx.max_packets = 0; /* Keep the default values of the coalescing parameters * aligned with the default napi_tx state. */ if (vi->sq[0].napi.weight) vi->intr_coal_tx.max_packets = 1; else vi->intr_coal_tx.max_packets = 0; } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) { /* The reason is the same as VIRTIO_NET_F_NOTF_COAL. */ for (i = 0; i < vi->max_queue_pairs; i++) if (vi->sq[i].napi.weight) vi->sq[i].intr_coal.max_packets = 1; err = virtnet_init_irq_moder(vi); if (err) goto free; } #ifdef CONFIG_SYSFS if (vi->mergeable_rx_bufs) dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group; #endif netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs); netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs); virtnet_init_settings(dev); if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) { vi->failover = net_failover_create(vi->dev); if (IS_ERR(vi->failover)) { err = PTR_ERR(vi->failover); goto free_vqs; } } if (vi->has_rss || vi->has_rss_hash_report) virtnet_init_default_rss(vi); enable_rx_mode_work(vi); /* serialize netdev register + virtio_device_ready() with ndo_open() */ rtnl_lock(); err = register_netdevice(dev); if (err) { pr_debug("virtio_net: registering device failed\n"); rtnl_unlock(); goto free_failover; } /* Disable config change notification until ndo_open. */ virtio_config_driver_disable(vi->vdev); virtio_device_ready(vdev); if (vi->has_rss || vi->has_rss_hash_report) { if (!virtnet_commit_rss_command(vi)) { dev_warn(&vdev->dev, "RSS disabled because committing failed.\n"); dev->hw_features &= ~NETIF_F_RXHASH; vi->has_rss_hash_report = false; vi->has_rss = false; } } virtnet_set_queues(vi, vi->curr_queue_pairs); /* a random MAC address has been assigned, notify the device. * We don't fail probe if VIRTIO_NET_F_CTRL_MAC_ADDR is not there * because many devices work fine without getting MAC explicitly */ if (!virtio_has_feature(vdev, VIRTIO_NET_F_MAC) && virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { struct scatterlist sg; sg_init_one(&sg, dev->dev_addr, dev->addr_len); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) { pr_debug("virtio_net: setting MAC address failed\n"); rtnl_unlock(); err = -EINVAL; goto free_unregister_netdev; } } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_DEVICE_STATS)) { struct virtio_net_stats_capabilities *stats_cap __free(kfree) = NULL; struct scatterlist sg; __le64 v; stats_cap = kzalloc(sizeof(*stats_cap), GFP_KERNEL); if (!stats_cap) { rtnl_unlock(); err = -ENOMEM; goto free_unregister_netdev; } sg_init_one(&sg, stats_cap, sizeof(*stats_cap)); if (!virtnet_send_command_reply(vi, VIRTIO_NET_CTRL_STATS, VIRTIO_NET_CTRL_STATS_QUERY, NULL, &sg)) { pr_debug("virtio_net: fail to get stats capability\n"); rtnl_unlock(); err = -EINVAL; goto free_unregister_netdev; } v = stats_cap->supported_stats_types[0]; vi->device_stats_cap = le64_to_cpu(v); } /* Assume link up if device can't report link status, otherwise get link status from config. */ netif_carrier_off(dev); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { virtnet_config_changed_work(&vi->config_work); } else { vi->status = VIRTIO_NET_S_LINK_UP; virtnet_update_settings(vi); netif_carrier_on(dev); } for (i = 0; i < ARRAY_SIZE(guest_offloads); i++) if (virtio_has_feature(vi->vdev, guest_offloads[i])) set_bit(guest_offloads[i], &vi->guest_offloads); vi->guest_offloads_capable = vi->guest_offloads; rtnl_unlock(); err = virtnet_cpu_notif_add(vi); if (err) { pr_debug("virtio_net: registering cpu notifier failed\n"); goto free_unregister_netdev; } pr_debug("virtnet: registered device %s with %d RX and TX vq's\n", dev->name, max_queue_pairs); return 0; free_unregister_netdev: unregister_netdev(dev); free_failover: net_failover_destroy(vi->failover); free_vqs: virtio_reset_device(vdev); cancel_delayed_work_sync(&vi->refill); free_receive_page_frags(vi); virtnet_del_vqs(vi); free: free_netdev(dev); return err; } static void remove_vq_common(struct virtnet_info *vi) { int i; virtio_reset_device(vi->vdev); /* Free unused buffers in both send and recv, if any. */ free_unused_bufs(vi); /* * Rule of thumb is netdev_tx_reset_queue() should follow any * skb freeing not followed by netdev_tx_completed_queue() */ for (i = 0; i < vi->max_queue_pairs; i++) netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); free_receive_bufs(vi); free_receive_page_frags(vi); virtnet_del_vqs(vi); } static void virtnet_remove(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; virtnet_cpu_notif_remove(vi); /* Make sure no work handler is accessing the device. */ flush_work(&vi->config_work); disable_rx_mode_work(vi); flush_work(&vi->rx_mode_work); virtnet_free_irq_moder(vi); unregister_netdev(vi->dev); net_failover_destroy(vi->failover); remove_vq_common(vi); free_netdev(vi->dev); } static __maybe_unused int virtnet_freeze(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; virtnet_cpu_notif_remove(vi); virtnet_freeze_down(vdev); remove_vq_common(vi); return 0; } static __maybe_unused int virtnet_restore(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; int err; err = virtnet_restore_up(vdev); if (err) return err; virtnet_set_queues(vi, vi->curr_queue_pairs); err = virtnet_cpu_notif_add(vi); if (err) { virtnet_freeze_down(vdev); remove_vq_common(vi); return err; } return 0; } static struct virtio_device_id id_table[] = { { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, { 0 }, }; #define VIRTNET_FEATURES \ VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \ VIRTIO_NET_F_MAC, \ VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \ VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \ VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \ VIRTIO_NET_F_HOST_USO, VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, \ VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \ VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \ VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \ VIRTIO_NET_F_CTRL_MAC_ADDR, \ VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \ VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \ VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL, \ VIRTIO_NET_F_VQ_NOTF_COAL, \ VIRTIO_NET_F_GUEST_HDRLEN, VIRTIO_NET_F_DEVICE_STATS static unsigned int features[] = { VIRTNET_FEATURES, }; static unsigned int features_legacy[] = { VIRTNET_FEATURES, VIRTIO_NET_F_GSO, VIRTIO_F_ANY_LAYOUT, }; static struct virtio_driver virtio_net_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .feature_table_legacy = features_legacy, .feature_table_size_legacy = ARRAY_SIZE(features_legacy), .driver.name = KBUILD_MODNAME, .id_table = id_table, .validate = virtnet_validate, .probe = virtnet_probe, .remove = virtnet_remove, .config_changed = virtnet_config_changed, #ifdef CONFIG_PM_SLEEP .freeze = virtnet_freeze, .restore = virtnet_restore, #endif }; static __init int virtio_net_driver_init(void) { int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online", virtnet_cpu_online, virtnet_cpu_down_prep); if (ret < 0) goto out; virtionet_online = ret; ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead", NULL, virtnet_cpu_dead); if (ret) goto err_dead; ret = register_virtio_driver(&virtio_net_driver); if (ret) goto err_virtio; return 0; err_virtio: cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); err_dead: cpuhp_remove_multi_state(virtionet_online); out: return ret; } module_init(virtio_net_driver_init); static __exit void virtio_net_driver_exit(void) { unregister_virtio_driver(&virtio_net_driver); cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); cpuhp_remove_multi_state(virtionet_online); } module_exit(virtio_net_driver_exit); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio network driver"); MODULE_LICENSE("GPL");
58 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sctp #if !defined(_TRACE_SCTP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCTP_H #include <net/sctp/structs.h> #include <linux/tracepoint.h> TRACE_EVENT(sctp_probe_path, TP_PROTO(struct sctp_transport *sp, const struct sctp_association *asoc), TP_ARGS(sp, asoc), TP_STRUCT__entry( __field(__u64, asoc) __field(__u32, primary) __array(__u8, ipaddr, sizeof(union sctp_addr)) __field(__u32, state) __field(__u32, cwnd) __field(__u32, ssthresh) __field(__u32, flight_size) __field(__u32, partial_bytes_acked) __field(__u32, pathmtu) ), TP_fast_assign( __entry->asoc = (unsigned long)asoc; __entry->primary = (sp == asoc->peer.primary_path); memcpy(__entry->ipaddr, &sp->ipaddr, sizeof(union sctp_addr)); __entry->state = sp->state; __entry->cwnd = sp->cwnd; __entry->ssthresh = sp->ssthresh; __entry->flight_size = sp->flight_size; __entry->partial_bytes_acked = sp->partial_bytes_acked; __entry->pathmtu = sp->pathmtu; ), TP_printk("asoc=%#llx%s ipaddr=%pISpc state=%u cwnd=%u ssthresh=%u " "flight_size=%u partial_bytes_acked=%u pathmtu=%u", __entry->asoc, __entry->primary ? "(*)" : "", __entry->ipaddr, __entry->state, __entry->cwnd, __entry->ssthresh, __entry->flight_size, __entry->partial_bytes_acked, __entry->pathmtu) ); TRACE_EVENT(sctp_probe, TP_PROTO(const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk), TP_ARGS(ep, asoc, chunk), TP_STRUCT__entry( __field(__u64, asoc) __field(__u32, mark) __field(__u16, bind_port) __field(__u16, peer_port) __field(__u32, pathmtu) __field(__u32, rwnd) __field(__u16, unack_data) ), TP_fast_assign( struct sk_buff *skb = chunk->skb; __entry->asoc = (unsigned long)asoc; __entry->mark = skb->mark; __entry->bind_port = ep->base.bind_addr.port; __entry->peer_port = asoc->peer.port; __entry->pathmtu = asoc->pathmtu; __entry->rwnd = asoc->peer.rwnd; __entry->unack_data = asoc->unack_data; ), TP_printk("asoc=%#llx mark=%#x bind_port=%d peer_port=%d pathmtu=%d " "rwnd=%u unack_data=%d", __entry->asoc, __entry->mark, __entry->bind_port, __entry->peer_port, __entry->pathmtu, __entry->rwnd, __entry->unack_data) ); #endif /* _TRACE_SCTP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
506 507 168 168 654 653 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 // SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju * Jim Keniston * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> #include <linux/highmem.h> #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> #include <linux/rcupdate_trace.h> #include <linux/workqueue.h> #include <linux/srcu.h> #include <linux/oom.h> /* check_stable_address_space */ #include <linux/uprobes.h> #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; /* * allows us to skip the uprobe_mmap if there are no uprobe events active * at this time. Probably a fine grained per inode count is better? */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Covers return_instance's uprobe lifetime. */ DEFINE_STATIC_SRCU(uretprobes_srcu); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ refcount_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ union { struct rcu_head rcu; struct work_struct work; }; loff_t offset; loff_t ref_ctr_offset; unsigned long flags; /* "unsigned long" so bitops work */ /* * The generic code assumes that it has two members of unknown type * owned by the arch-specific code: * * insn - copy_insn() saves the original instruction here for * arch_uprobe_analyze_insn(). * * ixol - potentially modified instruction to execute out of * line, copied to xol_area by xol_get_insn_slot(). */ struct arch_uprobe arch; }; struct delayed_uprobe { struct list_head list; struct uprobe *uprobe; struct mm_struct *mm; }; static DEFINE_MUTEX(delayed_uprobe_lock); static LIST_HEAD(delayed_uprobe_list); /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction * mangled by set_swbp(). * * On a breakpoint hit, thread contests for a slot. It frees the * slot after singlestep. Currently a fixed number of slots are * allocated. */ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ unsigned long *bitmap; /* 0 = free slot */ struct page *page; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ unsigned long vaddr; /* Page(s) of instruction slots */ }; static void uprobe_warn(struct task_struct *t, const char *msg) { pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); } /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have * changed after breakpoint was inserted. * - is_register: indicates if we are in register context. * - Return 1 if the specified virtual address is in an * executable vma. */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; } static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) { return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); } static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) { return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } /** * __replace_page - replace page in vma by new page. * based on replace_page in mm/ksm.c * * @vma: vma that holds the pte pointing to page * @addr: address the old @page is mapped at * @old_page: the page we are replacing by new_page * @new_page: the modified page we replace page by * * If @new_page is NULL, only unmap @old_page. * * Returns 0 on success, negative error code otherwise. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct folio *old_folio = page_folio(old_page); struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; pte_t pte; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); if (new_page) { new_folio = page_folio(new_page); err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } /* For folio_free_swap() below */ folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) goto unlock; VM_BUG_ON_PAGE(addr != pvmw.address, old_page); pte = ptep_get(pvmw.pte); /* * Handle PFN swap PTES, such as device-exclusive ones, that actually * map pages: simply trigger GUP again to fix it up. */ if (unlikely(!pte_present(pte))) { page_vma_mapped_walk_done(&pvmw); goto unlock; } if (new_page) { folio_get(new_folio); folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(pte)); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) set_pte_at(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); folio_remove_rmap_pte(old_folio, old_page, vma); if (!folio_mapped(old_folio)) folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); folio_unlock(old_folio); return err; } /** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn * Returns true if @insn is a breakpoint instruction. */ bool __weak is_swbp_insn(uprobe_opcode_t *insn) { return *insn == UPROBE_SWBP_INSN; } /** * is_trap_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_trap_insn * Returns true if @insn is a breakpoint instruction. * * This function is needed for the case where an architecture has multiple * trap instructions (like powerpc). */ bool __weak is_trap_insn(uprobe_opcode_t *insn) { return is_swbp_insn(insn); } static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); kunmap_atomic(kaddr); } static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { void *kaddr = kmap_atomic(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) { uprobe_opcode_t old_opcode; bool is_swbp; /* * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. * We do not check if it is any other 'trap variant' which could * be conditional trap instruction such as the one powerpc supports. * * The logic is that we do not care if the underlying instruction * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { if (is_swbp) /* register: already installed? */ return 0; } else { if (!is_swbp) /* unregister: was it changed by us? */ return 0; } return 1; } static struct delayed_uprobe * delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; list_for_each_entry(du, &delayed_uprobe_list, list) if (du->uprobe == uprobe && du->mm == mm) return du; return NULL; } static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; if (delayed_uprobe_check(uprobe, mm)) return 0; du = kzalloc(sizeof(*du), GFP_KERNEL); if (!du) return -ENOMEM; du->uprobe = uprobe; du->mm = mm; list_add(&du->list, &delayed_uprobe_list); return 0; } static void delayed_uprobe_delete(struct delayed_uprobe *du) { if (WARN_ON(!du)) return; list_del(&du->list); kfree(du); } static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) { struct list_head *pos, *q; struct delayed_uprobe *du; if (!uprobe && !mm) return; list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (uprobe && du->uprobe != uprobe) continue; if (mm && du->mm != mm) continue; delayed_uprobe_delete(du); } } static bool valid_ref_ctr_vma(struct uprobe *uprobe, struct vm_area_struct *vma) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); return uprobe->ref_ctr_offset && vma->vm_file && file_inode(vma->vm_file) == uprobe->inode && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && vma->vm_start <= vaddr && vma->vm_end > vaddr; } static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; return NULL; } static int __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; int ret; short *ptr; if (!vaddr || !d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, * it may return 0, in that case we have to return error. */ return ret == 0 ? -EBUSY : ret; } kaddr = kmap_atomic(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { pr_warn("ref_ctr going negative. vaddr: 0x%lx, " "curr val: %d, delta: %d\n", vaddr, *ptr, d); ret = -EINVAL; goto out; } *ptr += d; ret = 0; out: kunmap_atomic(kaddr); put_page(page); return ret; } static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); } static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, short d) { struct vm_area_struct *rc_vma; unsigned long rc_vaddr; int ret = 0; rc_vma = find_ref_ctr_vma(uprobe, mm); if (rc_vma) { rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); ret = __update_ref_ctr(mm, rc_vaddr, d); if (ret) update_ref_ctr_warn(uprobe, mm, d); if (d > 0) return ret; } mutex_lock(&delayed_uprobe_lock); if (d > 0) ret = delayed_uprobe_add(uprobe, mm); else delayed_uprobe_remove(uprobe, mm); mutex_unlock(&delayed_uprobe_lock); return ret; } /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct uprobe *uprobe; struct page *old_page, *new_page; struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; bool orig_page_huge = false; unsigned int gup_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); if (IS_ERR(old_page)) return PTR_ERR(old_page); ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) goto put_old; if (is_zero_page(old_page)) { ret = -EINVAL; goto put_old; } if (WARN(!is_register && PageCompound(old_page), "uprobe unregister should never work on compound page\n")) { ret = -EINVAL; goto put_old; } /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) goto put_old; ref_ctr_updated = 1; } ret = 0; if (!is_register && !PageAnon(old_page)) goto put_old; ret = anon_vma_prepare(vma); if (ret) goto put_old; ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) goto put_old; __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); if (!is_register) { struct page *orig_page; pgoff_t index; VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, index); if (orig_page) { if (PageUptodate(orig_page) && pages_identical(new_page, orig_page)) { /* let go new_page */ put_page(new_page); new_page = NULL; if (PageCompound(orig_page)) orig_page_huge = true; } put_page(orig_page); } } ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); if (new_page) put_page(new_page); put_old: put_page(old_page); if (unlikely(ret == -EAGAIN)) goto retry; /* Revert back reference counter if instruction update failed. */ if (ret && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); /* try collapse pmd for compound page */ if (!ret && orig_page_huge) collapse_pte_mapped_thp(mm, vaddr, false); return ret; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @mm: the probed process address space. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); } /** * set_orig_insn - Restore the original instruction. * @mm: the probed process address space. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { return uprobe_write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } /* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } /* * uprobe should have guaranteed lifetime, which can be either of: * - caller already has refcount taken (and wants an extra one); * - uprobe is RCU protected and won't be freed until after grace period; * - we are holding uprobes_treelock (for read or write, doesn't matter). */ static struct uprobe *try_get_uprobe(struct uprobe *uprobe) { if (refcount_inc_not_zero(&uprobe->ref)) return uprobe; return NULL; } static inline bool uprobe_is_active(struct uprobe *uprobe) { return !RB_EMPTY_NODE(&uprobe->rb_node); } static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } static void uprobe_free_srcu(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); } static void uprobe_free_deferred(struct work_struct *work) { struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); if (uprobe_is_active(uprobe)) { write_seqcount_begin(&uprobes_seqcount); rb_erase(&uprobe->rb_node, &uprobes_tree); write_seqcount_end(&uprobes_seqcount); } write_unlock(&uprobes_treelock); /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from * delayed_uprobe_list from remove_breakpoint(). Do it here. */ mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); /* start srcu -> rcu_tasks_trace -> kfree chain */ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); } static void put_uprobe(struct uprobe *uprobe) { if (!refcount_dec_and_test(&uprobe->ref)) return; INIT_WORK(&uprobe->work, uprobe_free_deferred); schedule_work(&uprobe->work); } /* Initialize hprobe as SRCU-protected "leased" uprobe */ static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) { WARN_ON(!uprobe); hprobe->state = HPROBE_LEASED; hprobe->uprobe = uprobe; hprobe->srcu_idx = srcu_idx; } /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) { hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; hprobe->uprobe = uprobe; hprobe->srcu_idx = -1; } /* * hprobe_consume() fetches hprobe's underlying uprobe and detects whether * uprobe is SRCU protected or is refcounted. hprobe_consume() can be * used only once for a given hprobe. * * Caller has to call hprobe_finalize() and pass previous hprobe_state, so * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever * is appropriate. */ static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) { *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); switch (*hstate) { case HPROBE_LEASED: case HPROBE_STABLE: return hprobe->uprobe; case HPROBE_GONE: /* uprobe is NULL, no SRCU */ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ return NULL; default: WARN(1, "hprobe invalid state %d", *hstate); return NULL; } } /* * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. * hprobe_finalize() can only be used from current context after * hprobe_consume() call (which determines uprobe and hstate value). */ static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) { switch (hstate) { case HPROBE_LEASED: __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); break; case HPROBE_STABLE: put_uprobe(hprobe->uprobe); break; case HPROBE_GONE: case HPROBE_CONSUMED: break; default: WARN(1, "hprobe invalid state %d", hstate); break; } } /* * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of * them can win the race to perform SRCU unlocking. Whoever wins must perform * SRCU unlock. * * Returns underlying valid uprobe or NULL, if there was no underlying uprobe * to begin with or we failed to bump its refcount and it's going away. * * Returned non-NULL uprobe can be still safely used within an ongoing SRCU * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has * an extra refcount for caller to assume and use. Otherwise, it's not * guaranteed that returned uprobe has a positive refcount, so caller has to * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current * SRCU lock region. See dup_utask(). */ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) { enum hprobe_state hstate; /* * Caller should guarantee that return_instance is not going to be * freed from under us. This can be achieved either through holding * rcu_read_lock() or by owning return_instance in the first place. * * Underlying uprobe is itself protected from reuse by SRCU, so ensure * SRCU lock is held properly. */ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { case HPROBE_STABLE: /* uprobe has positive refcount, bump refcount, if necessary */ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; case HPROBE_GONE: /* * SRCU was unlocked earlier and we didn't manage to take * uprobe refcnt, so it's effectively NULL */ return NULL; case HPROBE_CONSUMED: /* * uprobe was consumed, so it's effectively NULL as far as * uretprobe processing logic is concerned */ return NULL; case HPROBE_LEASED: { struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); /* * Try to switch hprobe state, guarding against * hprobe_consume() or another hprobe_expire() racing with us. * Note, if we failed to get uprobe refcount, we use special * HPROBE_GONE state to signal that hprobe->uprobe shouldn't * be used as it will be freed after SRCU is unlocked. */ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { /* We won the race, we are the ones to unlock SRCU */ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); return get ? get_uprobe(uprobe) : uprobe; } /* * We lost the race, undo refcount bump (if it ever happened), * unless caller would like an extra refcount anyways. */ if (uprobe && !get) put_uprobe(uprobe); /* * Even if hprobe_consume() or another hprobe_expire() wins * the state update race and unlocks SRCU from under us, we * still have a guarantee that underyling uprobe won't be * freed due to ongoing caller's SRCU lock region, so we can * return it regardless. Also, if `get` was true, we also have * an extra ref for the caller to own. This is used in dup_utask(). */ return uprobe; } default: WARN(1, "unknown hprobe state %d", hstate); return NULL; } } static __always_inline int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, const struct uprobe *r) { if (l_inode < r->inode) return -1; if (l_inode > r->inode) return 1; if (l_offset < r->offset) return -1; if (l_offset > r->offset) return 1; return 0; } #define __node_2_uprobe(node) \ rb_entry((node), struct uprobe, rb_node) struct __uprobe_key { struct inode *inode; loff_t offset; }; static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) { const struct __uprobe_key *a = key; return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); } static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) { struct uprobe *u = __node_2_uprobe(a); return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } /* * Assumes being inside RCU protected region. * No refcount is taken on returned uprobe. */ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; struct rb_node *node; unsigned int seq; lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); /* * Lockless RB-tree lookups can result only in false negatives. * If the element is found, it is correct and can be returned * under RCU protection. If we find nothing, we need to * validate that seqcount didn't change. If it did, we have to * try again as we might have missed the element (false * negative). If seqcount is unchanged, search truly failed. */ if (node) return __node_2_uprobe(node); } while (read_seqcount_retry(&uprobes_seqcount, seq)); return NULL; } /* * Attempt to insert a new uprobe into uprobes_tree. * * If uprobe already exists (for given inode+offset), we just increment * refcount of previously existing uprobe. * * If not, a provided new instance of uprobe is inserted into the tree (with * assumed initial refcount == 1). * * In any case, we return a uprobe instance that ends up being in uprobes_tree. * Caller has to clean up new uprobe instance, if it ended up not being * inserted into the tree. * * We assume that uprobes_treelock is held for writing. */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; again: node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); if (node) { struct uprobe *u = __node_2_uprobe(node); if (!try_get_uprobe(u)) { rb_erase(node, &uprobes_tree); RB_CLEAR_NODE(&u->rb_node); goto again; } return u; } return uprobe; } /* * Acquire uprobes_treelock and insert uprobe into uprobes_tree * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; write_lock(&uprobes_treelock); write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; } static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, (unsigned long long) uprobe->ref_ctr_offset); } static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); RB_CLEAR_NODE(&uprobe->rb_node); refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); kfree(uprobe); return ERR_PTR(-EINVAL); } kfree(uprobe); uprobe = cur_uprobe; } return uprobe; } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { static atomic64_t id; down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. * Should never be called with consumer that's not part of @uprobe->consumers. */ static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); } static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; /* * Ensure that the page that has the original instruction is populated * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; } static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping = uprobe->inode->i_mapping; loff_t offs = uprobe->offset; void *insn = &uprobe->arch.insn; int size = sizeof(uprobe->arch.insn); int len, err = -EIO; /* Copy only available bytes, -EIO if nothing was read */ do { if (offs >= i_size_read(uprobe->inode)) break; len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); err = __copy_insn(mapping, filp, insn, len, offs); if (err) break; insn += len; offs += len; size -= len; } while (size); return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, struct mm_struct *mm, unsigned long vaddr) { int ret = 0; if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; /* TODO: move this into _register, until then we abuse this sem. */ down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; ret = copy_insn(uprobe, file); if (ret) goto out; ret = -ENOTSUPP; if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out; smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: up_write(&uprobe->consumer_rwsem); return ret; } static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { return !uc->filter || uc->filter(uc, mm); } static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { ret = consumer_filter(uc, mm); if (ret) break; } up_read(&uprobe->consumer_rwsem); return ret; } static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) { bool first_uprobe; int ret; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; /* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); ret = set_swbp(&uprobe->arch, mm, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; } static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } struct map_info { struct map_info *next; struct mm_struct *mm; unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) { struct map_info *next = info->next; kfree(info); return next; } static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; struct map_info *info; int more = 0; again: i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (prev) prev->next = NULL; } if (!prev) { more++; continue; } if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; prev = prev->next; info->next = curr; curr = info; info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } i_mmap_unlock_read(mapping); if (!more) goto out; prev = curr; while (curr) { mmput(curr->mm); curr = curr->next; } do { info = kmalloc(sizeof(struct map_info), GFP_KERNEL); if (!info) { curr = ERR_PTR(-ENOMEM); goto out; } info->next = prev; prev = info; } while (--more); goto again; out: while (prev) prev = free_map_info(prev); return curr; } static int register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { bool is_register = !!new; struct map_info *info; int err = 0; percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); if (IS_ERR(info)) { err = PTR_ERR(info); goto out; } while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; if (err && is_register) goto free; /* * We take mmap_lock for writing to avoid the race with * find_active_uprobe_rcu() which takes mmap_lock for reading. * Thus this install_breakpoint() can not make * is_trap_at_addr() true right after find_uprobe_rcu() * returns NULL in find_active_uprobe_rcu(). */ mmap_write_lock(mm); if (check_stable_address_space(mm)) goto unlock; vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } unlock: mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); } out: percpu_up_write(&dup_mmap_sem); return err; } /** * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; down_write(&uprobe->register_rwsem); consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); up_write(&uprobe->register_rwsem); /* TODO : cant unregister? schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); return; } put_uprobe(uprobe); } EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); void uprobe_unregister_sync(void) { /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding * uprobe->register_rwsem, we need to wait for RCU grace period to * make sure that we can't call into just unregistered * uprobe_consumer's callbacks anymore. If we don't do that, fast and * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. */ synchronize_rcu_tasks_trace(); synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) return ERR_PTR(-EINVAL); uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) return uprobe; down_write(&uprobe->register_rwsem); consumer_add(uprobe, uc); ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); if (ret) { uprobe_unregister_nosync(uprobe, uc); /* * Registration might have partially succeeded, so we can have * this consumer being called right at this time. We need to * sync here. It's ok, it's unlikely slow path. */ uprobe_unregister_sync(); return ERR_PTR(ret); } return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); /** * uprobe_apply - add or remove the breakpoints according to @uc->filter * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints * Return: 0 on success or negative error code. */ int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; int ret = -ENOENT; down_write(&uprobe->register_rwsem); rcu_read_lock_trace(); list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? uc : NULL); break; } } rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); return ret; } static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; if (!valid_vma(vma, false) || file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; if (uprobe->offset < offset || uprobe->offset >= offset + vma->vm_end - vma->vm_start) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, mm, vaddr); } mmap_read_unlock(mm); return err; } static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { struct rb_node *n = uprobes_tree.rb_node; while (n) { struct uprobe *u = rb_entry(n, struct uprobe, rb_node); if (inode < u->inode) { n = n->rb_left; } else if (inode > u->inode) { n = n->rb_right; } else { if (max < u->offset) n = n->rb_left; else if (min > u->offset) n = n->rb_right; else break; } } return n; } /* * For a given range in vma, build a list of probes that need to be inserted. */ static void build_probe_list(struct inode *inode, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *head) { loff_t min, max; struct rb_node *n, *t; struct uprobe *u; INIT_LIST_HEAD(head); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) { struct list_head *pos, *q; struct delayed_uprobe *du; unsigned long vaddr; int ret = 0, err = 0; mutex_lock(&delayed_uprobe_lock); list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (du->mm != vma->vm_mm || !valid_ref_ctr_vma(du->uprobe, vma)) continue; vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); if (ret) { update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); if (!err) err = ret; } delayed_uprobe_delete(du); } mutex_unlock(&delayed_uprobe_lock); return err; } /* * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; if (no_uprobe_events()) return 0; if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); if (!inode) return 0; mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); /* * We can race with uprobe_unregister(), this uprobe can be already * removed. But in this case filter_chain() must return false, all * consumers have gone away. */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); return 0; } static bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) { loff_t min, max; struct inode *inode; struct rb_node *n; inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); read_unlock(&uprobes_treelock); return !!n; } /* * Called in context of a munmap of a vma. */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return; if (vma_has_uprobes(vma, start, end)) set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; vmf->page = area->page; get_page(vmf->page); return 0; } static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { return -EPERM; } static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, .mremap = xol_mremap, }; /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { struct vm_area_struct *vma; int ret; if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; } ret = 0; /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: mmap_write_unlock(mm); return ret; } void * __weak arch_uprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; *psize = UPROBE_SWBP_INSN_SIZE; return &insn; } static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; unsigned long insns_size; struct xol_area *area; void *insns; area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), GFP_KERNEL); if (!area->bitmap) goto free_area; area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); insns = arch_uprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: kfree(area); out: return NULL; } /* * get_xol_area - Allocate process's xol_area if necessary. * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; if (!mm->uprobes_state.xol_area) __create_xol_area(0); /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } /* * uprobe_clear_state - Free the area allocated for slots. */ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); if (!area) return; put_page(area->page); kfree(area->bitmap); kfree(area); } void uprobe_start_dup_mmap(void) { percpu_down_read(&dup_mmap_sem); } void uprobe_end_dup_mmap(void) { percpu_up_read(&dup_mmap_sem); } void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ set_bit(MMF_RECALC_UPROBES, &newmm->flags); } } static unsigned long xol_get_slot_nr(struct xol_area *area) { unsigned long slot_nr; slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); if (slot_nr < UINSNS_PER_PAGE) { if (!test_and_set_bit(slot_nr, area->bitmap)) return slot_nr; } return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. */ static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { struct xol_area *area = get_xol_area(); unsigned long slot_nr; if (!area) return false; wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return true; } /* * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ static void xol_free_insn_slot(struct uprobe_task *utask) { struct xol_area *area = current->mm->uprobes_state.xol_area; unsigned long offset = utask->xol_vaddr - area->vaddr; unsigned int slot_nr; utask->xol_vaddr = 0; /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; slot_nr = offset / UPROBE_XOL_SLOT_BYTES; clear_bit(slot_nr, area->bitmap); smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { /* Initialize the slot */ copy_to_page(page, vaddr, src, len); /* * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. */ flush_dcache_page(page); } /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint * instruction. * Return the address of the breakpoint instruction. */ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) { return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } unsigned long uprobe_get_trap_addr(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (unlikely(utask && utask->active_uprobe)) return utask->vaddr; return instruction_pointer(regs); } static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { ri->cons_cnt = 0; ri->next = utask->ri_pool; utask->ri_pool = ri; } static struct return_instance *ri_pool_pop(struct uprobe_task *utask) { struct return_instance *ri = utask->ri_pool; if (likely(ri)) utask->ri_pool = ri->next; return ri; } static void ri_free(struct return_instance *ri) { kfree(ri->extra_consumers); kfree_rcu(ri, rcu); } static void free_ret_instance(struct uprobe_task *utask, struct return_instance *ri, bool cleanup_hprobe) { unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; (void)hprobe_consume(&ri->hprobe, &hstate); hprobe_finalize(&ri->hprobe, hstate); } /* * At this point return_instance is unlinked from utask's * return_instances list and this has become visible to ri_timer(). * If seqcount now indicates that ri_timer's return instance * processing loop isn't active, we can return ri into the pool of * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ ri_pool_push(utask, ri); } else { /* we might be racing with ri_timer(), so play it safe */ ri_free(ri); } } /* * Called with no locks held. * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; struct return_instance *ri, *ri_next; if (!utask) return; t->utask = NULL; WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) { ri_next = ri->next; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } /* free_ret_instance() above might add to ri_pool, so this loop should come last */ ri = utask->ri_pool; while (ri) { ri_next = ri->next; ri_free(ri); ri = ri_next; } kfree(utask); } #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ #define for_each_ret_instance_rcu(pos, head) \ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) static void ri_timer(struct timer_list *timer) { struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); struct return_instance *ri; /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ guard(srcu)(&uretprobes_srcu); /* RCU protects return_instance from freeing. */ guard(rcu)(); write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) { struct uprobe_task *utask; utask = kzalloc(sizeof(*utask), GFP_KERNEL); if (!utask) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); seqcount_init(&utask->ri_seqcount); return utask; } /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ static struct uprobe_task *get_utask(void) { if (!current->utask) current->utask = alloc_utask(); return current->utask; } static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; ri = ri_pool_pop(utask); if (ri) return ri; ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { struct return_instance *ri; ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); if (!ri) return NULL; if (unlikely(old->cons_cnt > 1)) { ri->extra_consumers = kmemdup(old->extra_consumers, sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), GFP_KERNEL); if (!ri->extra_consumers) { kfree(ri); return NULL; } } return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; struct uprobe *uprobe; n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; /* protect uprobes from freeing, we'll need try_get_uprobe() them */ guard(srcu)(&uretprobes_srcu); p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { n = dup_return_instance(o); if (!n) return -ENOMEM; /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ uprobe = hprobe_expire(&o->hprobe, true); /* * New utask will have stable properly refcounted uprobe or * NULL. Even if we failed to get refcounted uprobe, we still * need to preserve full set of return_instances for proper * uretprobe handling and nesting in forked task. */ hprobe_init_stable(&n->hprobe, uprobe); n->next = NULL; rcu_assign_pointer(*p, n); p = &n->next; n_utask->depth++; } return 0; } static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) return; if (!__create_xol_area(current->utask->dup_xol_addr) && !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } /* * Called in context of a new clone/fork from copy_process. */ void uprobe_copy_process(struct task_struct *t, unsigned long flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; struct xol_area *area; t->utask = NULL; if (!utask || !utask->return_instances) return; if (mm == t->mm && !(flags & CLONE_VFORK)) return; if (dup_utask(t, utask)) return uprobe_warn(t, "dup ret instances"); /* The task can fork() after dup_xol_work() fails */ area = mm->uprobes_state.xol_area; if (!area) return uprobe_warn(t, "dup xol area"); if (mm == t->mm) return; t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. * * Returns -1 in case the xol_area is not allocated. */ unsigned long uprobe_get_trampoline_vaddr(void) { unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; return trampoline_vaddr; } static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, struct return_instance *ri) { struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; int srcu_idx; if (!get_xol_area()) goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); goto free; } trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. */ uprobe_warn(current, "handle tail call"); goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } /* __srcu_read_lock() because SRCU lock survives switch to user space */ srcu_idx = __srcu_read_lock(&uretprobes_srcu); ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; rcu_assign_pointer(utask->return_instances, ri); mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; free: ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask = current->utask; int err; if (!try_get_uprobe(uprobe)) return -EINVAL; if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } utask->vaddr = bp_vaddr; err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(utask); goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; err_out: put_uprobe(uprobe); return err; } /* * If we are singlestepping, then ensure this thread is not connected to * non-fatal signals until completion of singlestep. When xol insn itself * triggers the signal, restart the original insn even if the task is * already SIGKILL'ed (since coredump should report the correct ip). This * is even more important if the task has a handler for SIGSEGV/etc, The * _same_ instruction should be repeated again after return from the signal * handler, and SSTEP can never finish in this case. */ bool uprobe_deny_signal(void) { struct task_struct *t = current; struct uprobe_task *utask = t->utask; if (likely(!utask || !utask->active_uprobe)) return false; WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); } } return true; } static void mmf_recalc_uprobes(struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; } clear_bit(MMF_HAS_UPROBES, &mm->flags); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; int result; if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) return -EINVAL; pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) goto out; result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; struct file *vm_file; loff_t offset; unsigned int seq; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return NULL; vma = vma_lookup(mm, bp_vaddr); if (!vma) return NULL; /* * vm_file memory can be reused for another instance of struct file, * but can't be freed from under us, so it's safe to read fields from * it, even if the values are some garbage values; ultimately * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure * that whatever we speculatively found is correct */ vm_file = READ_ONCE(vma->vm_file); if (!vm_file) return NULL; offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); uprobe = find_uprobe_rcu(vm_file->f_inode, offset); if (!uprobe) return NULL; /* now double check that nothing about MM changed */ if (mmap_lock_speculate_retry(mm, seq)) return NULL; return uprobe; } /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; uprobe = find_active_uprobe_speculative(bp_vaddr); if (uprobe) return uprobe; mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); return uprobe; } static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { struct return_consumer *ric; if (unlikely(ri == ZERO_SIZE_PTR)) return ri; if (unlikely(ri->cons_cnt > 0)) { ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); if (!ric) { ri_free(ri); return ZERO_SIZE_PTR; } ri->extra_consumers = ric; } ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; ric->id = id; ric->cookie = cookie; ri->cons_cnt++; return ri; } static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; int idx; for (idx = *iter; idx < ri->cons_cnt; idx++) { ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } return NULL; } static bool ignore_ret_handler(int rc) { return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; } static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; struct uprobe_task *utask = current->utask; utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; __u64 cookie = 0; int rc = 0; if (uc->handler) { rc = uc->handler(uc, regs, &cookie); WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; if (!uc->ret_handler || ignore_ret_handler(rc)) continue; if (!ri) ri = alloc_return_instance(utask); if (session) ri = push_consumer(ri, uc->id, cookie); } utask->auprobe = NULL; if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); if (remove && has_consumers) { down_read(&uprobe->register_rwsem); /* re-check that removal is still required, this time under lock */ if (!filter_chain(uprobe, current->mm)) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); } up_read(&uprobe->register_rwsem); } } static void handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { struct return_consumer *ric; struct uprobe_consumer *uc; int ric_idx = 0; /* all consumers unsubscribed meanwhile */ if (unlikely(!uprobe)) return; rcu_read_lock_trace(); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; if (uc->ret_handler) { ric = return_consumer_find(ri, &ric_idx, uc->id); if (!session || ric) uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL); } } rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) { bool chained; do { chained = ri->chained; ri = ri->next; /* can't be NULL if chained */ } while (chained); return ri; } void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; utask = current->utask; if (!utask) goto sigill; ri = utask->return_instances; if (!ri) goto sigill; do { /* * We should throw out the frames invalidated by longjmp(). * If this chain is valid, then the next one should be alive * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). */ next_chain = find_next_ret_chain(ri); valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { /* pop current instance from the stack of pending return instances, * as it's not pending anymore: we just fixed up original * instruction pointer in regs and are about to call handlers; * this allows fixup_uretprobe_trampoline_entries() to properly fix up * captured stack traces from uretprobe handlers, in which pending * trampoline addresses on the stack are replaced with correct * original return addresses */ ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) handle_uretprobe_chain(ri, uprobe, regs); hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. */ free_ret_instance(utask, ri, false /* !cleanup_hprobe */); ri = ri_next; } while (ri != next_chain); } while (!valid); return; sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) { return false; } bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { return true; } /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't * access this memory. The latter is only possible if * another thread plays with our ->mm. In both cases * we can simply restart. If this vma was unmapped we * can pretend this insn was not executed yet and get * the (correct) SIGSEGV after restart. */ instruction_pointer_set(regs, bp_vaddr); } goto out; } /* change it in advance for ->handler() and restart */ instruction_pointer_set(regs, bp_vaddr); /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; /* * Pairs with the smp_wmb() in prepare_uprobe(). * * Guarantees that if we see the UPROBE_COPY_INSN bit set, then * we must also see the stores to &uprobe->arch performed by the * prepare_uprobe() call. */ smp_rmb(); /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; if (arch_uprobe_ignore(&uprobe->arch, regs)) goto out; handler_chain(uprobe, regs); if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (pre_ssout(uprobe, regs, bp_vaddr)) goto out; out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ rcu_read_unlock_trace(); } /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. */ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else WARN_ON_ONCE(1); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); if (utask->signal_denied) { set_thread_flag(TIF_SIGPENDING); utask->signal_denied = false; } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); force_sig(SIGILL); } } /* * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and * allows the thread to return from interrupt. After that handle_swbp() * sets utask->active_uprobe. * * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). */ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; clear_thread_flag(TIF_UPROBE); utask = current->utask; if (utask && utask->active_uprobe) handle_singlestep(utask, regs); else handle_swbp(regs); } /* * uprobe_pre_sstep_notifier gets called from interrupt context as part of * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { if (!current->mm) return 0; if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) && (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); return 1; } /* * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep. */ int uprobe_post_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (!current->mm || !utask || !utask->active_uprobe) /* task is currently not uprobed */ return 0; utask->state = UTASK_SSTEP_ACK; set_thread_flag(TIF_UPROBE); return 1; } static struct notifier_block uprobe_exception_nb = { .notifier_call = arch_uprobe_exception_notify, .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); BUG_ON(register_die_notifier(&uprobe_exception_nb)); }
2076 14 13 14 14 14 14 13 15 12 3 1 14 14 14 14 3 14 3 4 4 4 4 15 15 12 15 4 39 39 2059 2062 2061 2046 2054 1804 1766 1766 1773 1766 1764 1722 660 658 637 335 5 99 331 333 99 335 387 641 640 641 640 7 3 7 4 3 1982 2 2069 1898 1899 642 619 557 641 127 15 117 1745 1746 1751 1747 1748 1746 1744 1747 1750 1748 3 1745 108 1648 1890 1891 1880 1866 1800 1868 5 3 2 1653 1652 5 5 5 315 1684 1685 142 41 141 140 140 140 156 157 102 80 80 101 2 99 99 1984 1931 1981 1986 1986 1982 181 181 1750 1750 1751 1746 1746 111 1754 640 641 639 639 639 639 636 641 640 640 640 600 601 166 168 168 98 88 723 598 638 39 39 39 39 37 4 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/hash.h> #include "kernfs-internal.h" DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() * will perform wakeups when releasing console_sem. Holding rename_lock * will introduce deadlock if the scheduler reads the kernfs_name in the * wakeup path. */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) static bool __kernfs_active(struct kernfs_node *kn) { return atomic_read(&kn->active) >= 0; } static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC return kn->flags & KERNFS_LOCKDEP; #else return false; #endif } /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; while (rcu_dereference(to->__parent) && to != from) { depth++; to = rcu_dereference(to->__parent); } return depth; } static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, struct kernfs_node *b) { size_t da, db; struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); if (ra != rb) return NULL; da = kernfs_depth(ra->kn, a); db = kernfs_depth(rb->kn, b); while (da > db) { a = rcu_dereference(a->__parent); da--; } while (db > da) { b = rcu_dereference(b->__parent); db--; } /* worst case b and a will be the same at root */ while (b != a) { b = rcu_dereference(b->__parent); a = rcu_dereference(a->__parent); } return a; } /** * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, * where kn_from is treated as root of the path. * @kn_from: kernfs node which should be treated as root for the path * @kn_to: kernfs node to which path is needed * @buf: buffer to copy the path into * @buflen: size of @buf * * We need to handle couple of scenarios here: * [1] when @kn_from is an ancestor of @kn_to at some level * kn_from: /n1/n2/n3 * kn_to: /n1/n2/n3/n4/n5 * result: /n4/n5 * * [2] when @kn_from is on a different hierarchy and we need to find common * ancestor between @kn_from and @kn_to. * kn_from: /n1/n2/n3/n4 * kn_to: /n1/n2/n5 * result: /../../n5 * OR * kn_from: /n1/n2/n3/n4/n5 [depth=5] * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * * [3] when @kn_to is %NULL result will be "(null)" * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen) { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; ssize_t copied; int i, j; if (!kn_to) return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); buf[0] = '\0'; for (i = 0; i < depth_from; i++) { copied = strscpy(buf + len, parent_str, buflen - len); if (copied < 0) return copied; len += copied; } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { const char *name; for (kn = kn_to, j = 0; j < i; j++) kn = rcu_dereference(kn->__parent); name = rcu_dereference(kn->name); len += scnprintf(buf + len, buflen - len, "/%s", name); } return len; } /** * kernfs_name - obtain the name of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * * Return: the resulting length of @buf. If @buf isn't long enough, * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { struct kernfs_node *kn_parent; if (!kn) return strscpy(buf, "(null)", buflen); guard(rcu)(); /* * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and * the parent is either existing or not. */ kn_parent = rcu_dereference(kn->__parent); return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen); } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { struct kernfs_root *root; guard(rcu)(); if (to) { root = kernfs_root(to); if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { guard(read_lock_irqsave)(&kernfs_rename_lock); return kernfs_path_from_node_locked(to, from, buf, buflen); } } return kernfs_path_from_node_locked(to, from, buf, buflen); } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { if (sz == -E2BIG) pr_cont("(name too long)"); else pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. * * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; unsigned long flags; read_lock_irqsave(&kernfs_rename_lock, flags); parent = kernfs_parent(kn); kernfs_get(parent); read_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } /** * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns < kn->ns) return -1; if (ns > kn->ns) return 1; return strcmp(name, kernfs_rcu_name(kn)); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * Return: * %0 on success, -EEXIST on failure. */ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node *parent = NULL; struct kernfs_node *kn_parent; struct rb_node **node; kn_parent = kernfs_parent(kn); node = &kn_parent->dir.children.rb_node; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn_parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs++; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. * * Return: %true if @kn was actually removed, * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { struct kernfs_node *kn_parent; if (RB_EMPTY_NODE(&kn->rb)) return false; kn_parent = kernfs_parent(kn); down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs--; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn_parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is %NULL. * * Return: * Pointer to @kn on success, %NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; if (!atomic_inc_unless_negative(&kn->active)) return NULL; if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } /** * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) __releases(&kernfs_root(kn)->kernfs_rwsem) __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); /* * Skip draining if already fully drained. This avoids draining and its * lockdep annotations for nodes which have never been activated * allowing embedding kernfs_remove() in create error paths without * worrying about draining. */ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && !kernfs_should_drain_open_files(kn)) return; up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); } wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, _RET_IP_); } if (kernfs_should_drain_open_files(kn)) kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } /** * kernfs_get - get a reference count on a kernfs_node * @kn: the target kernfs_node */ void kernfs_get(struct kernfs_node *kn) { if (kn) { WARN_ON(!atomic_read(&kn->count)); atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); static void kernfs_free_rcu(struct rcu_head *rcu) { struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); /* If the whole node goes away, then name can't be used outside */ kfree_const(rcu_access_pointer(kn->name)); if (kn->iattr) { simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } kmem_cache_free(kernfs_node_cache, kn); } /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node * * Put a reference count of @kn and destroy it if it reached zero. */ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kernfs_parent(kn); WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", parent ? rcu_dereference(parent->name) : "", rcu_dereference(kn->name), atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry * is accessible, the returned node can be in any state and the caller is * fully responsible for determining what's accessible. */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); if (!name) return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); rcu_assign_pointer(kn->name, name); kn->mode = mode; kn->flags = flags; if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = uid, .ia_gid = gid, }; ret = __kernfs_setattr(kn, &iattr); if (ret < 0) goto err_out3; } if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) goto err_out3; } return kn; err_out3: spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree_const(name); return NULL; } struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; if (parent->mode & S_ISGID) { /* this code block imitates inode_init_owner() for * kernfs */ if (parent->iattr) gid = parent->iattr->ia_gid; if (flags & KERNFS_DIR) mode |= S_ISGID; } kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); rcu_assign_pointer(kn->__parent, parent); } return kn; } /* * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root * @id: the target node id * * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * * Return: %NULL on failure, * otherwise a kernfs node with reference counter incremented. */ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ if (kernfs_ino(kn) != ino) goto err_unlock; } else { /* 0 matches all generations */ if (unlikely(gen && kernfs_gen(kn) != gen)) goto err_unlock; } /* * We should fail if @kn has never been activated and guarantee success * if the caller knows that @kn is active. Both can be achieved by * __kernfs_active() which tests @kn->active without kernfs_rwsem. */ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; rcu_read_unlock(); return kn; err_unlock: rcu_read_unlock(); return NULL; } /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * * Return: * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); struct kernfs_iattrs *ps_iattr; struct kernfs_node *parent; bool has_ns; int ret; down_write(&root->kernfs_rwsem); parent = kernfs_parent(kn); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", kernfs_rcu_name(parent), kernfs_rcu_name(kn))) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ down_write(&root->kernfs_iattr_rwsem); ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. * If not activated here, the kernfs user is responsible for * activating the node with kernfs_activate(). A node which hasn't * been activated is not visible to userland and its removal won't * trigger deactivation. */ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return 0; out_unlock: up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_find_ns - find kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const void *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } unsigned int kernfs_root_flags(struct kernfs_node *kn) { return kernfs_root(kn)->flags; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. * High bits generation. The starting value for both ino and * genenration is 1. Initialize upper 32bit allocation * accordingly. */ if (sizeof(ino_t) >= sizeof(u64)) root->id_highbits = 0; else root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } kn->priv = priv; kn->dir.root = root; root->syscall_ops = scops; root->flags = flags; root->kn = kn; init_waitqueue_head(&root->deactivate_waitq); if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return root; } /** * kernfs_destroy_root - destroy a kernfs hierarchy * @root: root of the hierarchy to destroy * * Destroy the hierarchy anchored at @root by removing all existing * directories and destroying @root. */ void kernfs_destroy_root(struct kernfs_root *root) { /* * kernfs_remove holds kernfs_rwsem from the root so the root * shouldn't be freed during the operation. */ kernfs_get(root->kn); kernfs_remove(root->kn); kernfs_put(root->kn); /* will also free @root */ } /** * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root * @root: root to use to lookup * * Return: @root's kernfs_node */ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) { return root->kn; } /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @uid: uid of the new directory * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, mode | S_IFDIR, uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->dir.root = parent->dir.root; kn->ns = ns; kn->priv = priv; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } /** * kernfs_create_empty_dir - create an always empty directory * @parent: parent in which to create a new directory * @name: name of the new directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->flags |= KERNFS_EMPTY_DIR; kn->dir.root = parent->dir.root; kn->ns = NULL; kn->priv = NULL; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn, *parent; struct kernfs_root *root; if (flags & LOOKUP_RCU) return -ECHILD; /* Negative hashed dentry? */ if (d_really_is_negative(dentry)) { /* If the kernfs parent node has changed discard and * proceed to ->lookup. * * There's nothing special needed here when getting the * dentry parent, even if a concurrent rename is in * progress. That's because the dentry is negative so * it can only be the target of the rename and it will * be doing a d_move() not a replace. Consequently the * dentry d_parent won't change over the d_move(). * * Also kernfs negative dentries transitioning from * negative to positive during revalidate won't happen * because they are invalidated on containing directory * changes and the lookup re-done so that a new positive * dentry can be properly created. */ root = kernfs_root_from_sb(dentry->d_sb); down_read(&root->kernfs_rwsem); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { if (kernfs_dir_changed(parent, dentry)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. */ return 1; } kn = kernfs_dentry_node(dentry); root = kernfs_root(kn); down_read(&root->kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) goto out_bad; parent = kernfs_parent(kn); /* The kernfs node has been moved? */ if (kernfs_dentry_node(dentry->d_parent) != parent) goto out_bad; /* The kernfs node has been renamed */ if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0) goto out_bad; /* The kernfs node has been moved to a different namespace */ if (parent && kernfs_ns_enabled(parent) && kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; up_read(&root->kernfs_rwsem); return 1; out_bad: up_read(&root->kernfs_rwsem); return 0; } const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, }; static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct kernfs_root *root; struct inode *inode = NULL; const void *ns = NULL; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ if (kn) { /* Inactive nodes are invisible to the VFS so don't * create a negative. */ if (!kernfs_active(kn)) { up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } /* * Needed for negative dentry validation. * The negative dentry can be created in kernfs_iop_lookup() * or transforms from positive dentry in dentry_unlink_inode() * called from vfs_rmdir(). */ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&root->kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); } static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) return ERR_PTR(-EPERM); if (!kernfs_get_active(parent)) return ERR_PTR(-ENODEV); ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); return ERR_PTR(ret); } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (!scops || !scops->rmdir) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; ret = scops->rmdir(kn); kernfs_put_active(kn); return ret; } static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (flags) return -EINVAL; if (!scops || !scops->rename) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; if (!kernfs_get_active(new_parent)) { kernfs_put_active(kn); return -ENODEV; } ret = scops->rename(kn, new_parent, new_dentry->d_name.name); kernfs_put_active(new_parent); kernfs_put_active(kn); return ret; } const struct inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, .mkdir = kernfs_iop_mkdir, .rmdir = kernfs_iop_rmdir, .rename = kernfs_iop_rename, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) { struct kernfs_node *last; while (true) { struct rb_node *rbn; last = pos; if (kernfs_type(pos) != KERNFS_DIR) break; rbn = rb_first(&pos->dir.children); if (!rbn) break; pos = rb_to_kn(rbn); } return last; } /** * kernfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: kernfs_node whose descendants to walk * * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. * * Return: the next descendant to visit or %NULL when done. */ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return kernfs_parent(pos); } static void kernfs_activate_one(struct kernfs_node *kn) { lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); kn->flags |= KERNFS_ACTIVATED; if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) return; WARN_ON_ONCE(rcu_access_pointer(kn->__parent) && RB_EMPTY_NODE(&kn->rb)); WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); } /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated * * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node * needs to be explicitly activated. A node which hasn't been activated * isn't visible to userland and deactivation is skipped during its * removal. This is useful to construct atomic init sequences where * creation of multiple nodes should either succeed or fail atomically. * * The caller is responsible for ensuring that this function is not called * after kernfs_remove*() is invoked on @kn. */ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) kernfs_activate_one(pos); up_write(&root->kernfs_rwsem); } /** * kernfs_show - show or hide a node * @kn: kernfs_node to show or hide * @show: whether to show or hide * * If @show is %false, @kn is marked hidden and deactivated. A hidden node is * ignored in future activaitons. If %true, the mark is removed and activation * state is restored. This function won't implicitly activate a new node in a * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. * * To avoid recursion complexities, directories aren't supported for now. */ void kernfs_show(struct kernfs_node *kn, bool show) { struct kernfs_root *root = kernfs_root(kn); if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) return; down_write(&root->kernfs_rwsem); if (show) { kn->flags &= ~KERNFS_HIDDEN; if (kn->flags & KERNFS_ACTIVATED) kernfs_activate_one(kn); } else { kn->flags |= KERNFS_HIDDEN; if (kernfs_active(kn)) atomic_add(KN_DEACTIVATED_BIAS, &kn->active); kernfs_drain(kn); } up_write(&root->kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos, *parent; /* Short-circuit if non-root @kn has already finished removal. */ if (!kn) return; lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* * This is for kernfs_remove_self() which plays with active ref * after removal. */ if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb)) return; pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn)); /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); kernfs_drain(pos); parent = kernfs_parent(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ if (!parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = parent ? parent->iattr : NULL; /* update timestamps on the parent */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } kernfs_put(pos); } while (pos != kn); } /** * kernfs_remove - remove a kernfs_node recursively * @kn: the kernfs_node to remove * * Remove @kn along with all its subdirectories and files. */ void kernfs_remove(struct kernfs_node *kn) { struct kernfs_root *root; if (!kn) return; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); __kernfs_remove(kn); up_write(&root->kernfs_rwsem); } /** * kernfs_break_active_protection - break out of active protection * @kn: the self kernfs_node * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. Each invocation of * this function must also be matched with an invocation of * kernfs_unbreak_active_protection(). * * This function releases the active reference of @kn the caller is * holding. Once this function is called, @kn may be removed at any point * and the caller is solely responsible for ensuring that the objects it * dereferences are accessible. */ void kernfs_break_active_protection(struct kernfs_node *kn) { /* * Take out ourself out of the active ref dependency chain. If * we're called without an active ref, lockdep will complain. */ kernfs_put_active(kn); } /** * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() * @kn: the self kernfs_node * * If kernfs_break_active_protection() was called, this function must be * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of * being removed. Once kernfs_break_active_protection() is invoked, that * protection is irreversibly gone for the kernfs operation instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location * would be right before the enclosing kernfs operation returns. */ void kernfs_unbreak_active_protection(struct kernfs_node *kn) { /* * @kn->active could be in any state; however, the increment we do * here will be undone as soon as the enclosing kernfs operation * finishes and this temporary bump can't break anything. If @kn * is alive, nothing changes. If @kn is being deactivated, the * soon-to-follow put will either finish deactivation or restore * deactivated state. If @kn is already removed, the temporary * bump is guaranteed to be gone before @kn is released. */ atomic_inc(&kn->active); if (kernfs_lockdep(kn)) rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); } /** * kernfs_remove_self - remove a kernfs_node from its own method * @kn: the self kernfs_node to remove * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. This can be used to * implement a file operation which deletes itself. * * For example, the "delete" file for a sysfs device directory can be * implemented by invoking kernfs_remove_self() on the "delete" file * itself. This function breaks the circular dependency of trying to * deactivate self while holding an active ref itself. It isn't necessary * to modify the usual removal path to use kernfs_remove_self(). The * "delete" implementation can simply invoke kernfs_remove_self() on self * before proceeding with the usual removal path. kernfs will ignore later * kernfs_remove() on self. * * kernfs_remove_self() can be called multiple times concurrently on the * same kernfs_node. Only the first one actually performs removal and * returns %true. All others will wait until the kernfs operation which * won self-removal finishes and return %false. Note that the losers wait * for the completion of not only the winning kernfs_remove_self() but also * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. * * Return: %true if @kn is removed by this call, otherwise %false. */ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored * while kernfs_rwsem for held exclusive. The ones which lost * arbitration waits for SUICIDED && drained which can happen only * after the enclosing kernfs operation which executed the winning * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; __kernfs_remove(kn); kn->flags |= KERNFS_SUICIDED; ret = true; } else { wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; DEFINE_WAIT(wait); while (true) { prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); if ((kn->flags & KERNFS_SUICIDED) && atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; up_write(&root->kernfs_rwsem); schedule(); down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); ret = false; } /* * This must be done while kernfs_rwsem held exclusive; otherwise, * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target * @name: name of the kernfs_node to remove * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. * * Return: %0 on success, -ENOENT if such entry doesn't exist. */ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", name); return -ENOENT; } root = kernfs_root(parent); down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) { kernfs_get(kn); __kernfs_remove(kn); kernfs_put(kn); } up_write(&root->kernfs_rwsem); if (kn) return 0; else return -ENOENT; } /** * kernfs_rename_ns - move and rename a kernfs_node * @kn: target node * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag * * Return: %0 on success, -errno on failure. */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; struct kernfs_root *root; const char *old_name; int error; /* can't move or rename root */ if (!rcu_access_pointer(kn->__parent)) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; old_parent = kernfs_parent(kn); if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) { error = -EINVAL; if (WARN_ON_ONCE(old_parent != new_parent)) goto out; } error = 0; old_name = kernfs_rcu_name(kn); if (!new_name) new_name = old_name; if ((old_parent == new_parent) && (kn->ns == new_ns) && (strcmp(old_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; /* rename kernfs_node */ if (strcmp(old_name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) goto out; } else { new_name = NULL; } /* * Move to the appropriate place in the appropriate directories rbtree. */ kernfs_unlink_sibling(kn); /* rename_lock protects ->parent accessors */ if (old_parent != new_parent) { kernfs_get(new_parent); write_lock_irq(&kernfs_rename_lock); rcu_assign_pointer(kn->__parent, new_parent); kn->ns = new_ns; if (new_name) rcu_assign_pointer(kn->name, new_name); write_unlock_irq(&kernfs_rename_lock); kernfs_put(old_parent); } else { /* name assignment is RCU protected, parent is the same */ kn->ns = new_ns; if (new_name) rcu_assign_pointer(kn->name, new_name); } kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns); kernfs_link_sibling(kn); if (new_name && !is_kernel_rodata((unsigned long)old_name)) kfree_rcu_mightsleep(old_name); error = 0; out: up_write(&root->kernfs_rwsem); return error; } static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); return 0; } static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { int valid = kernfs_active(pos) && rcu_access_pointer(pos->__parent) == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { struct rb_node *node = parent->dir.children.rb_node; while (node) { pos = rb_to_kn(node); if (hash < pos->hash) node = node->rb_left; else if (hash > pos->hash) node = node->rb_right; else break; } } /* Skip over entries which are dying/dead or in the wrong namespace */ while (pos && (!kernfs_active(pos) || pos->ns != ns)) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } return pos; } static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } while (pos && (!kernfs_active(pos) || pos->ns != ns)); } return pos; } static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; struct kernfs_root *root; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = kernfs_rcu_name(pos); unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); if (!dir_emit(ctx, name, len, ino, type)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; } const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, .iterate_shared = kernfs_fop_readdir, .release = kernfs_dir_fop_release, .llseek = generic_file_llseek, };
6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. */ #include "rxe.h" #include "rxe_hw_counters.h" static const struct rdma_stat_desc rxe_counter_descs[] = { [RXE_CNT_SENT_PKTS].name = "sent_pkts", [RXE_CNT_RCVD_PKTS].name = "rcvd_pkts", [RXE_CNT_DUP_REQ].name = "duplicate_request", [RXE_CNT_OUT_OF_SEQ_REQ].name = "out_of_seq_request", [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err", [RXE_CNT_SND_RNR].name = "send_rnr_err", [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err", [RXE_CNT_SENDER_SCHED].name = "ack_deferred", [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err", [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err", [RXE_CNT_COMP_RETRY].name = "completer_retry_err", [RXE_CNT_SEND_ERR].name = "send_err", [RXE_CNT_LINK_DOWNED].name = "link_downed", [RXE_CNT_RDMA_SEND].name = "rdma_sends", [RXE_CNT_RDMA_RECV].name = "rdma_recvs", }; int rxe_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index) { struct rxe_dev *dev = to_rdev(ibdev); unsigned int cnt; if (!port || !stats) return -EINVAL; for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_descs); cnt++) stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]); return ARRAY_SIZE(rxe_counter_descs); } struct rdma_hw_stats *rxe_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_descs) != RXE_NUM_OF_COUNTERS); return rdma_alloc_hw_stats_struct(rxe_counter_descs, ARRAY_SIZE(rxe_counter_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 // SPDX-License-Identifier: GPL-2.0-or-later /* * xfrm_output.c - Common IPsec encapsulation code. * * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> */ #include <linux/errno.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> #include <net/gso.h> #include <net/icmp.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_route.h> #include <net/ipv6_stubs.h> #endif #include "xfrm_inout.h" static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb); static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); static int xfrm_skb_check_space(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); int nhead = dst->header_len + LL_RESERVED_SPACE(dst->dev) - skb_headroom(skb); int ntail = dst->dev->needed_tailroom - skb_tailroom(skb); if (nhead <= 0) { if (ntail <= 0) return 0; nhead = 0; } else if (ntail < 0) ntail = 0; return pskb_expand_head(skb, nhead, ntail, GFP_ATOMIC); } /* Children define the path of the packet through the * Linux networking. Thus, destinations are stackable. */ static struct dst_entry *skb_dst_pop(struct sk_buff *skb) { struct dst_entry *child = dst_clone(xfrm_dst_child(skb_dst(skb))); skb_dst_drop(skb); return child; } /* Add encapsulation header. * * The IP header will be moved forward to make space for the encapsulation * header. */ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl * 4; skb_set_inner_transport_header(skb, skb_transport_offset(skb)); skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); skb->transport_header = skb->network_header + ihl; __skb_pull(skb, ihl); memmove(skb_network_header(skb), iph, ihl); return 0; } #if IS_ENABLED(CONFIG_IPV6_MIP6) static int mip6_rthdr_offset(struct sk_buff *skb, u8 **nexthdr, int type) { const unsigned char *nh = skb_network_header(skb); unsigned int offset = sizeof(struct ipv6hdr); unsigned int packet_len; int found_rhdr = 0; packet_len = skb_tail_pointer(skb) - nh; *nexthdr = &ipv6_hdr(skb)->nexthdr; while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { case NEXTHDR_HOP: break; case NEXTHDR_ROUTING: if (type == IPPROTO_ROUTING && offset + 3 <= packet_len) { struct ipv6_rt_hdr *rt; rt = (struct ipv6_rt_hdr *)(nh + offset); if (rt->type != 0) return offset; } found_rhdr = 1; break; case NEXTHDR_DEST: /* HAO MUST NOT appear more than once. * XXX: It is better to try to find by the end of * XXX: packet if HAO exists. */ if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) { net_dbg_ratelimited("mip6: hao exists already, override\n"); return offset; } if (found_rhdr) return offset; break; default: return offset; } if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) return -EINVAL; exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); offset += ipv6_optlen(exthdr); if (offset > IPV6_MAXPLEN) return -EINVAL; *nexthdr = &exthdr->nexthdr; } return -EINVAL; } #endif #if IS_ENABLED(CONFIG_IPV6) static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr) { switch (x->type->proto) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: return mip6_rthdr_offset(skb, prevhdr, x->type->proto); #endif default: break; } return ip6_find_1stfragopt(skb, prevhdr); } #endif /* Add encapsulation header. * * The IP header and mutable extension headers will be moved forward to make * space for the encapsulation header. */ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *iph; u8 *prevhdr; int hdr_len; iph = ipv6_hdr(skb); skb_set_inner_transport_header(skb, skb_transport_offset(skb)); hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) return hdr_len; skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); skb_set_network_header(skb, -x->props.header_len); skb->transport_header = skb->network_header + hdr_len; __skb_pull(skb, hdr_len); memmove(ipv6_hdr(skb), iph, hdr_len); return 0; #else WARN_ON_ONCE(1); return -EAFNOSUPPORT; #endif } /* Add route optimization header space. * * The IP header and mutable extension headers will be moved forward to make * space for the route optimization header. */ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *iph; u8 *prevhdr; int hdr_len; iph = ipv6_hdr(skb); hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) return hdr_len; skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); skb_set_network_header(skb, -x->props.header_len); skb->transport_header = skb->network_header + hdr_len; __skb_pull(skb, hdr_len); memmove(ipv6_hdr(skb), iph, hdr_len); return 0; #else WARN_ON_ONCE(1); return -EAFNOSUPPORT; #endif } /* Add encapsulation header. * * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. */ static int xfrm4_beet_encap_add(struct xfrm_state *x, struct sk_buff *skb) { struct ip_beet_phdr *ph; struct iphdr *top_iph; int hdrlen, optlen; hdrlen = 0; optlen = XFRM_MODE_SKB_CB(skb)->optlen; if (unlikely(optlen)) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); skb_set_network_header(skb, -x->props.header_len - hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); if (x->sel.family != AF_INET6) skb->network_header += IPV4_BEET_PHMAXLEN; skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); skb->transport_header = skb->network_header + sizeof(*top_iph); xfrm4_beet_make_header(skb); ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen); top_iph = ip_hdr(skb); if (unlikely(optlen)) { if (WARN_ON(optlen < 0)) return -EINVAL; ph->padlen = 4 - (optlen & 4); ph->hdrlen = optlen / 8; ph->nexthdr = top_iph->protocol; if (ph->padlen) memset(ph + 1, IPOPT_NOP, ph->padlen); top_iph->protocol = IPPROTO_BEETPH; top_iph->ihl = sizeof(struct iphdr) / 4; } top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; return 0; } /* Add encapsulation header. * * The top IP header will be constructed per RFC 2401. */ static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) { bool small_ipv6 = (skb->protocol == htons(ETH_P_IPV6)) && (skb->len <= IPV6_MIN_MTU); struct dst_entry *dst = skb_dst(skb); struct iphdr *top_iph; int flags; skb_set_inner_network_header(skb, skb_network_offset(skb)); skb_set_inner_transport_header(skb, skb_transport_offset(skb)); skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); skb->transport_header = skb->network_header + sizeof(*top_iph); top_iph = ip_hdr(skb); top_iph->ihl = 5; top_iph->version = 4; top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */ if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) top_iph->tos = 0; else top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos; top_iph->tos = INET_ECN_encapsulate(top_iph->tos, XFRM_MODE_SKB_CB(skb)->tos); flags = x->props.flags; if (flags & XFRM_STATE_NOECN) IP_ECN_clear(top_iph); top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) || small_ipv6 ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; ip_select_ident(dev_net(dst->dev), skb, NULL); return 0; } #if IS_ENABLED(CONFIG_IPV6) static int xfrm6_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *top_iph; int dsfield; skb_set_inner_network_header(skb, skb_network_offset(skb)); skb_set_inner_transport_header(skb, skb_transport_offset(skb)); skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr); skb->transport_header = skb->network_header + sizeof(*top_iph); top_iph = ipv6_hdr(skb); top_iph->version = 6; memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, sizeof(top_iph->flow_lbl)); top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family); if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) dsfield = 0; else dsfield = XFRM_MODE_SKB_CB(skb)->tos; dsfield = INET_ECN_encapsulate(dsfield, XFRM_MODE_SKB_CB(skb)->tos); if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = *(struct in6_addr *)&x->props.saddr; top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; } static int xfrm6_beet_encap_add(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *top_iph; struct ip_beet_phdr *ph; int optlen, hdr_len; hdr_len = 0; optlen = XFRM_MODE_SKB_CB(skb)->optlen; if (unlikely(optlen)) hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4); skb_set_network_header(skb, -x->props.header_len - hdr_len); if (x->sel.family != AF_INET6) skb->network_header += IPV4_BEET_PHMAXLEN; skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr); skb->transport_header = skb->network_header + sizeof(*top_iph); ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdr_len); xfrm6_beet_make_header(skb); top_iph = ipv6_hdr(skb); if (unlikely(optlen)) { if (WARN_ON(optlen < 0)) return -EINVAL; ph->padlen = 4 - (optlen & 4); ph->hdrlen = optlen / 8; ph->nexthdr = top_iph->nexthdr; if (ph->padlen) memset(ph + 1, IPOPT_NOP, ph->padlen); top_iph->nexthdr = IPPROTO_BEETPH; } top_iph->saddr = *(struct in6_addr *)&x->props.saddr; top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; } #endif /* Add encapsulation header. * * On exit, the transport header will be set to the start of the * encapsulation header to be filled in by x->type->output and the mac * header will be set to the nextheader (protocol for IPv4) field of the * extension header directly preceding the encapsulation header, or in * its absence, that of the top IP header. * The value of the network header will always point to the top IP header * while skb->data will point to the payload. */ static int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) { int err; err = xfrm_inner_extract_output(x, skb); if (err) return err; IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; skb->protocol = htons(ETH_P_IP); switch (x->props.mode) { case XFRM_MODE_BEET: return xfrm4_beet_encap_add(x, skb); case XFRM_MODE_TUNNEL: return xfrm4_tunnel_encap_add(x, skb); } WARN_ON_ONCE(1); return -EOPNOTSUPP; } static int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) int err; err = xfrm_inner_extract_output(x, skb); if (err) return err; skb->ignore_df = 1; skb->protocol = htons(ETH_P_IPV6); switch (x->props.mode) { case XFRM_MODE_BEET: return xfrm6_beet_encap_add(x, skb); case XFRM_MODE_TUNNEL: return xfrm6_tunnel_encap_add(x, skb); default: WARN_ON_ONCE(1); return -EOPNOTSUPP; } #endif WARN_ON_ONCE(1); return -EAFNOSUPPORT; } static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.mode) { case XFRM_MODE_BEET: case XFRM_MODE_TUNNEL: if (x->props.family == AF_INET) return xfrm4_prepare_output(x, skb); if (x->props.family == AF_INET6) return xfrm6_prepare_output(x, skb); break; case XFRM_MODE_TRANSPORT: if (x->props.family == AF_INET) return xfrm4_transport_output(x, skb); if (x->props.family == AF_INET6) return xfrm6_transport_output(x, skb); break; case XFRM_MODE_ROUTEOPTIMIZATION: if (x->props.family == AF_INET6) return xfrm6_ro_output(x, skb); WARN_ON_ONCE(1); break; default: if (x->mode_cbs && x->mode_cbs->prepare_output) return x->mode_cbs->prepare_output(x, skb); WARN_ON_ONCE(1); break; } return -EOPNOTSUPP; } #if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) { return xfrm_outer_mode_output(x, skb); } EXPORT_SYMBOL_GPL(pktgen_xfrm_outer_mode_output); #endif static int xfrm_output_one(struct sk_buff *skb, int err) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; struct net *net = xs_net(x); if (err <= 0 || x->xso.type == XFRM_DEV_OFFLOAD_PACKET) goto resume; do { err = xfrm_skb_check_space(skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); goto error_nolock; } skb->mark = xfrm_smark_get(skb->mark, x); err = xfrm_outer_mode_output(x, skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); goto error_nolock; } spin_lock_bh(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEINVALID); err = -EINVAL; goto error; } err = xfrm_state_check_expire(x); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEEXPIRED); goto error; } err = xfrm_replay_overflow(x, skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATESEQERROR); goto error; } x->curlft.bytes += skb->len; x->curlft.packets++; x->lastused = ktime_get_real_seconds(); spin_unlock_bh(&x->lock); skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); err = -EHOSTUNREACH; goto error_nolock; } if (xfrm_offload(skb)) { x->type_offload->encap(x, skb); } else { /* Inner headers are invalid now. */ skb->encapsulation = 0; err = x->type->output(x, skb); if (err == -EINPROGRESS) goto out; } resume: if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR); goto error_nolock; } dst = skb_dst_pop(skb); if (!dst) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); err = -EHOSTUNREACH; goto error_nolock; } skb_dst_set(skb, dst); x = dst->xfrm; } while (x && !(x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL)); return 0; error: spin_unlock_bh(&x->lock); error_nolock: kfree_skb(skb); out: return err; } int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err) { struct net *net = xs_net(skb_dst(skb)->xfrm); while (likely((err = xfrm_output_one(skb, err)) == 0)) { nf_reset_ct(skb); err = skb_dst(skb)->ops->local_out(net, sk, skb); if (unlikely(err != 1)) goto out; if (!skb_dst(skb)->xfrm) return dst_output(net, sk, skb); err = nf_hook(skb_dst(skb)->ops->family, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, xfrm_output2); if (unlikely(err != 1)) goto out; } if (err == -EINPROGRESS) err = 0; out: return err; } EXPORT_SYMBOL_GPL(xfrm_output_resume); static int xfrm_dev_direct_output(struct sock *sk, struct xfrm_state *x, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net *net = xs_net(x); int err; dst = skb_dst_pop(skb); if (!dst) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); kfree_skb(skb); return -EHOSTUNREACH; } skb_dst_set(skb, dst); nf_reset_ct(skb); err = skb_dst(skb)->ops->local_out(net, sk, skb); if (unlikely(err != 1)) { kfree_skb(skb); return err; } /* In transport mode, network destination is * directly reachable, while in tunnel mode, * inner packet network may not be. In packet * offload type, HW is responsible for hard * header packet mangling so directly xmit skb * to netdevice. */ skb->dev = x->xso.dev; __skb_push(skb, skb->dev->hard_header_len); return dev_queue_xmit(skb); } static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { return xfrm_output_resume(sk, skb, 1); } static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb) { struct sk_buff *segs, *nskb; BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_GSO_CB_OFFSET); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) return -EINVAL; skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); err = xfrm_output2(net, sk, segs); if (unlikely(err)) { kfree_skb_list(nskb); return err; } } return 0; } /* For partial checksum offload, the outer header checksum is calculated * by software and the inner header checksum is calculated by hardware. * This requires hardware to know the inner packet type to calculate * the inner header checksum. Save inner ip protocol here to avoid * traversing the packet in the vendor's xmit code. * For IPsec tunnel mode save the ip protocol from the IP header of the * plain text packet. Otherwise If the encap type is IPIP, just save * skb->inner_ipproto in any other case get the ip protocol from the IP * header. */ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) { struct xfrm_offload *xo = xfrm_offload(skb); const struct ethhdr *eth; if (!xo) return; if (x->outer_mode.encap == XFRM_MODE_TUNNEL) { switch (x->outer_mode.family) { case AF_INET: xo->inner_ipproto = ip_hdr(skb)->protocol; break; case AF_INET6: xo->inner_ipproto = ipv6_hdr(skb)->nexthdr; break; default: break; } return; } if (x->outer_mode.encap == XFRM_MODE_IPTFS) { xo->inner_ipproto = IPPROTO_AGGFRAG; return; } /* non-Tunnel Mode */ if (!skb->encapsulation) return; if (skb->inner_protocol_type == ENCAP_TYPE_IPPROTO) { xo->inner_ipproto = skb->inner_ipproto; return; } if (skb->inner_protocol_type != ENCAP_TYPE_ETHER) return; eth = (struct ethhdr *)skb_inner_mac_header(skb); switch (ntohs(eth->h_proto)) { case ETH_P_IPV6: xo->inner_ipproto = inner_ipv6_hdr(skb)->nexthdr; break; case ETH_P_IP: xo->inner_ipproto = inner_ip_hdr(skb)->protocol; break; } } int xfrm_output(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); struct xfrm_state *x = skb_dst(skb)->xfrm; int family; int err; family = (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) ? x->outer_mode.family : skb_dst(skb)->ops->family; switch (family) { case AF_INET: memset(IPCB(skb), 0, sizeof(*IPCB(skb))); IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; break; case AF_INET6: memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; break; } if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { if (!xfrm_dev_offload_ok(skb, x)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); kfree_skb(skb); return -EHOSTUNREACH; } /* Exclusive direct xmit for tunnel mode, as * some filtering or matching rules may apply * in transport mode. */ if (x->props.mode == XFRM_MODE_TUNNEL) return xfrm_dev_direct_output(sk, x, skb); return xfrm_output_resume(sk, skb, 0); } secpath_reset(skb); if (xfrm_dev_offload_ok(skb, x)) { struct sec_path *sp; sp = secpath_set(skb); if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); kfree_skb(skb); return -ENOMEM; } sp->olen++; sp->xvec[sp->len++] = x; xfrm_state_hold(x); xfrm_get_inner_ipproto(skb, x); skb->encapsulation = 1; if (skb_is_gso(skb)) { if (skb->inner_protocol && x->props.mode == XFRM_MODE_TUNNEL) return xfrm_output_gso(net, sk, skb); skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; goto out; } if (x->xso.dev && x->xso.dev->features & NETIF_F_HW_ESP_TX_CSUM) goto out; } else { if (skb_is_gso(skb)) return xfrm_output_gso(net, sk, skb); } if (skb->ip_summed == CHECKSUM_PARTIAL) { err = skb_checksum_help(skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); kfree_skb(skb); return err; } } out: return xfrm_output2(net, sk, skb); } EXPORT_SYMBOL_GPL(xfrm_output); int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) goto out; mtu = dst_mtu(skb_dst(skb)); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { skb->protocol = htons(ETH_P_IP); if (skb->sk && sk_fullsock(skb->sk)) xfrm_local_error(skb, mtu); else icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); ret = -EMSGSIZE; } out: return ret; } EXPORT_SYMBOL_GPL(xfrm4_tunnel_check_size); static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) { int err; if (x->outer_mode.encap == XFRM_MODE_BEET && ip_is_fragment(ip_hdr(skb))) { net_warn_ratelimited("BEET mode doesn't support inner IPv4 fragments\n"); return -EAFNOSUPPORT; } err = xfrm4_tunnel_check_size(skb); if (err) return err; XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; xfrm4_extract_header(skb); return 0; } #if IS_ENABLED(CONFIG_IPV6) int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); struct sock *sk = skb_to_full_sk(skb); if (skb->ignore_df) goto out; mtu = dst_mtu(dst); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) { skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); if (xfrm6_local_dontfrag(sk)) ipv6_stub->xfrm6_local_rxpmtu(skb, mtu); else if (sk) xfrm_local_error(skb, mtu); else icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ret = -EMSGSIZE; } out: return ret; } EXPORT_SYMBOL_GPL(xfrm6_tunnel_check_size); #endif static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) int err; err = xfrm6_tunnel_check_size(skb); if (err) return err; XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; xfrm6_extract_header(skb); return 0; #else WARN_ON_ONCE(1); return -EAFNOSUPPORT; #endif } static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return xfrm4_extract_output(x, skb); case htons(ETH_P_IPV6): return xfrm6_extract_output(x, skb); } return -EAFNOSUPPORT; } void xfrm_local_error(struct sk_buff *skb, int mtu) { unsigned int proto; struct xfrm_state_afinfo *afinfo; if (skb->protocol == htons(ETH_P_IP)) proto = AF_INET; else if (skb->protocol == htons(ETH_P_IPV6) && skb->sk->sk_family == AF_INET6) proto = AF_INET6; else return; afinfo = xfrm_state_get_afinfo(proto); if (afinfo) { afinfo->local_error(skb, mtu); rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(xfrm_local_error);
26 29 26 30 26 26 13 11 9 7 5 2 2 26 26 13 13 26 6 20 18 2 20 20 20 23 16 18 20 8 8 14 13 13 19 15 1 3 19 20 16 7 2 26 26 1 2 18 16 2 1 19 17 4 19 19 3 15 1 6 20 14 13 7 14 7 7 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/in.h> #include <linux/export.h> #include <linux/sched/clock.h> #include <linux/time.h> #include <linux/rds.h> #include "rds.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, struct in6_addr *saddr) { refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; inc->i_saddr = *saddr; inc->i_usercopy.rdma_cookie = 0; inc->i_usercopy.rx_tstamp = ktime_set(0, 0); memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace)); } EXPORT_SYMBOL_GPL(rds_inc_init); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, struct in6_addr *saddr) { refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = cp->cp_conn; inc->i_conn_path = cp; inc->i_saddr = *saddr; inc->i_usercopy.rdma_cookie = 0; inc->i_usercopy.rx_tstamp = ktime_set(0, 0); } EXPORT_SYMBOL_GPL(rds_inc_path_init); static void rds_inc_addref(struct rds_incoming *inc) { rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount)); refcount_inc(&inc->i_refcount); } void rds_inc_put(struct rds_incoming *inc) { rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount)); if (refcount_dec_and_test(&inc->i_refcount)) { BUG_ON(!list_empty(&inc->i_item)); inc->i_conn->c_trans->inc_free(inc); } } EXPORT_SYMBOL_GPL(rds_inc_put); static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, struct rds_cong_map *map, int delta, __be16 port) { int now_congested; if (delta == 0) return; rs->rs_rcv_bytes += delta; if (delta > 0) rds_stats_add(s_recv_bytes_added_to_socket, delta); else rds_stats_add(s_recv_bytes_removed_from_socket, -delta); /* loop transport doesn't send/recv congestion updates */ if (rs->rs_transport->t_type == RDS_TRANS_LOOP) return; now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d " "now_cong %d delta %d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, rds_sk_rcvbuf(rs), now_congested, delta); /* wasn't -> am congested */ if (!rs->rs_congested && now_congested) { rs->rs_congested = 1; rds_cong_set_bit(map, port); rds_cong_queue_updates(map); } /* was -> aren't congested */ /* Require more free space before reporting uncongested to prevent bouncing cong/uncong state too often */ else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) { rs->rs_congested = 0; rds_cong_clear_bit(map, port); rds_cong_queue_updates(map); } /* do nothing if no change in cong state */ } static void rds_conn_peer_gen_update(struct rds_connection *conn, u32 peer_gen_num) { int i; struct rds_message *rm, *tmp; unsigned long flags; WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP); if (peer_gen_num != 0) { if (conn->c_peer_gen_num != 0 && peer_gen_num != conn->c_peer_gen_num) { for (i = 0; i < RDS_MPATH_WORKERS; i++) { struct rds_conn_path *cp; cp = &conn->c_path[i]; spin_lock_irqsave(&cp->cp_lock, flags); cp->cp_next_tx_seq = 1; cp->cp_next_rx_seq = 0; list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { set_bit(RDS_MSG_FLUSH, &rm->m_flags); } spin_unlock_irqrestore(&cp->cp_lock, flags); } } conn->c_peer_gen_num = peer_gen_num; } } /* * Process all extension headers that come with this message. */ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs) { struct rds_header *hdr = &inc->i_hdr; unsigned int pos = 0, type, len; union { struct rds_ext_header_version version; struct rds_ext_header_rdma rdma; struct rds_ext_header_rdma_dest rdma_dest; } buffer; while (1) { len = sizeof(buffer); type = rds_message_next_extension(hdr, &pos, &buffer, &len); if (type == RDS_EXTHDR_NONE) break; /* Process extension header here */ switch (type) { case RDS_EXTHDR_RDMA: rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0); break; case RDS_EXTHDR_RDMA_DEST: /* We ignore the size for now. We could stash it * somewhere and use it for error checking. */ inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie( be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); break; } } } static void rds_recv_hs_exthdrs(struct rds_header *hdr, struct rds_connection *conn) { unsigned int pos = 0, type, len; union { struct rds_ext_header_version version; u16 rds_npaths; u32 rds_gen_num; } buffer; u32 new_peer_gen_num = 0; while (1) { len = sizeof(buffer); type = rds_message_next_extension(hdr, &pos, &buffer, &len); if (type == RDS_EXTHDR_NONE) break; /* Process extension header here */ switch (type) { case RDS_EXTHDR_NPATHS: conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, be16_to_cpu(buffer.rds_npaths)); break; case RDS_EXTHDR_GEN_NUM: new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num); break; default: pr_warn_ratelimited("ignoring unknown exthdr type " "0x%x\n", type); } } /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ conn->c_npaths = max_t(int, conn->c_npaths, 1); conn->c_ping_triggered = 0; rds_conn_peer_gen_update(conn, new_peer_gen_num); } /* rds_start_mprds() will synchronously start multiple paths when appropriate. * The scheme is based on the following rules: * * 1. rds_sendmsg on first connect attempt sends the probe ping, with the * sender's npaths (s_npaths) * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It * sends back a probe-pong with r_npaths. After that, if rcvr is the * smaller ip addr, it starts rds_conn_path_connect_if_down on all * mprds_paths. * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down. * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be * called after reception of the probe-pong on all mprds_paths. * Otherwise (sender of probe-ping is not the smaller ip addr): just call * rds_conn_path_connect_if_down on the hashed path. (see rule 4) * 4. rds_connect_worker must only trigger a connection if laddr < faddr. * 5. sender may end up queuing the packet on the cp. will get sent out later. * when connection is completed. */ static void rds_start_mprds(struct rds_connection *conn) { int i; struct rds_conn_path *cp; if (conn->c_npaths > 1 && rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) { for (i = 0; i < conn->c_npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_connect_if_down(cp); } } } /* * The transport must make sure that this is serialized against other * rx and conn reset on this specific conn. * * We currently assert that only one fragmented message will be sent * down a connection at a time. This lets us reassemble in the conn * instead of per-flow which means that we don't have to go digging through * flows to tear down partial reassembly progress on conn failure and * we save flow lookup and locking for each frag arrival. It does mean * that small messages will wait behind large ones. Fragmenting at all * is only to reduce the memory consumption of pre-posted buffers. * * The caller passes in saddr and daddr instead of us getting it from the * conn. This lets loopback, who only has one conn for both directions, * tell us which roles the addrs in the conn are playing for this message. */ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp) { struct rds_sock *rs = NULL; struct sock *sk; unsigned long flags; struct rds_conn_path *cp; inc->i_conn = conn; inc->i_rx_jiffies = jiffies; if (conn->c_trans->t_mp_capable) cp = inc->i_conn_path; else cp = &conn->c_path[0]; rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " "flags 0x%x rx_jiffies %lu\n", conn, (unsigned long long)cp->cp_next_rx_seq, inc, (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), be32_to_cpu(inc->i_hdr.h_len), be16_to_cpu(inc->i_hdr.h_sport), be16_to_cpu(inc->i_hdr.h_dport), inc->i_hdr.h_flags, inc->i_rx_jiffies); /* * Sequence numbers should only increase. Messages get their * sequence number as they're queued in a sending conn. They * can be dropped, though, if the sending socket is closed before * they hit the wire. So sequence numbers can skip forward * under normal operation. They can also drop back in the conn * failover case as previously sent messages are resent down the * new instance of a conn. We drop those, otherwise we have * to assume that the next valid seq does not come after a * hole in the fragment stream. * * The headers don't give us a way to realize if fragments of * a message have been dropped. We assume that frags that arrive * to a flow are part of the current message on the flow that is * being reassembled. This means that senders can't drop messages * from the sending conn until all their frags are sent. * * XXX we could spend more on the wire to get more robust failure * detection, arguably worth it to avoid data corruption. */ if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { rds_stats_inc(s_recv_drop_old_seq); goto out; } cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { if (inc->i_hdr.h_sport == 0) { rdsdebug("ignore ping with 0 sport from %pI6c\n", saddr); goto out; } rds_stats_inc(s_recv_ping); rds_send_pong(cp, inc->i_hdr.h_sport); /* if this is a handshake ping, start multipath if necessary */ if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport), be16_to_cpu(inc->i_hdr.h_dport))) { rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); rds_start_mprds(cp->cp_conn); } goto out; } if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT && inc->i_hdr.h_sport == 0) { rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); /* if this is a handshake pong, start multipath if necessary */ rds_start_mprds(cp->cp_conn); wake_up(&cp->cp_conn->c_hs_waitq); goto out; } rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if); if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; } /* Process extension headers */ rds_recv_incoming_exthdrs(inc, rs); /* We can be racing with rds_release() which marks the socket dead. */ sk = rds_rs_to_sk(rs); /* serialize with rds_release -> sock_orphan */ write_lock_irqsave(&rs->rs_recv_lock, flags); if (!sock_flag(sk, SOCK_DEAD)) { rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs); rds_stats_inc(s_recv_queued); rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); if (sock_flag(sk, SOCK_RCVTSTAMP)) inc->i_usercopy.rx_tstamp = ktime_get_real(); rds_inc_addref(inc); inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); } else { rds_stats_inc(s_recv_drop_dead_sock); } write_unlock_irqrestore(&rs->rs_recv_lock, flags); out: if (rs) rds_sock_put(rs); } EXPORT_SYMBOL_GPL(rds_recv_incoming); /* * be very careful here. This is being called as the condition in * wait_event_*() needs to cope with being called many times. */ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) { unsigned long flags; if (!*inc) { read_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&rs->rs_recv_queue)) { *inc = list_entry(rs->rs_recv_queue.next, struct rds_incoming, i_item); rds_inc_addref(*inc); } read_unlock_irqrestore(&rs->rs_recv_lock, flags); } return *inc != NULL; } static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, int drop) { struct sock *sk = rds_rs_to_sk(rs); int ret = 0; unsigned long flags; struct rds_incoming *to_drop = NULL; write_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&inc->i_item)) { ret = 1; if (drop) { /* XXX make sure this i_conn is reliable */ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); list_del_init(&inc->i_item); to_drop = inc; } } write_unlock_irqrestore(&rs->rs_recv_lock, flags); if (to_drop) rds_inc_put(to_drop); rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); return ret; } /* * Pull errors off the error queue. * If msghdr is NULL, we will just purge the error queue. */ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) { struct rds_notifier *notifier; struct rds_rdma_notify cmsg; unsigned int count = 0, max_messages = ~0U; unsigned long flags; LIST_HEAD(copy); int err = 0; memset(&cmsg, 0, sizeof(cmsg)); /* fill holes with zero */ /* put_cmsg copies to user space and thus may sleep. We can't do this * with rs_lock held, so first grab as many notifications as we can stuff * in the user provided cmsg buffer. We don't try to copy more, to avoid * losing notifications - except when the buffer is so small that it wouldn't * even hold a single notification. Then we give him as much of this single * msg as we can squeeze in, and set MSG_CTRUNC. */ if (msghdr) { max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg)); if (!max_messages) max_messages = 1; } spin_lock_irqsave(&rs->rs_lock, flags); while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { notifier = list_entry(rs->rs_notify_queue.next, struct rds_notifier, n_list); list_move(&notifier->n_list, &copy); count++; } spin_unlock_irqrestore(&rs->rs_lock, flags); if (!count) return 0; while (!list_empty(&copy)) { notifier = list_entry(copy.next, struct rds_notifier, n_list); if (msghdr) { cmsg.user_token = notifier->n_user_token; cmsg.status = notifier->n_status; err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, sizeof(cmsg), &cmsg); if (err) break; } list_del_init(&notifier->n_list); kfree(notifier); } /* If we bailed out because of an error in put_cmsg, * we may be left with one or more notifications that we * didn't process. Return them to the head of the list. */ if (!list_empty(&copy)) { spin_lock_irqsave(&rs->rs_lock, flags); list_splice(&copy, &rs->rs_notify_queue); spin_unlock_irqrestore(&rs->rs_lock, flags); } return err; } /* * Queue a congestion notification */ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) { uint64_t notify = rs->rs_cong_notify; unsigned long flags; int err; err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, sizeof(notify), &notify); if (err) return err; spin_lock_irqsave(&rs->rs_lock, flags); rs->rs_cong_notify &= ~notify; spin_unlock_irqrestore(&rs->rs_lock, flags); return 0; } /* * Receive any control messages. */ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, struct rds_sock *rs) { int ret = 0; if (inc->i_usercopy.rdma_cookie) { ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, sizeof(inc->i_usercopy.rdma_cookie), &inc->i_usercopy.rdma_cookie); if (ret) goto out; } if ((inc->i_usercopy.rx_tstamp != 0) && sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { struct __kernel_old_timeval tv = ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp); if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) { ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } else { struct __kernel_sock_timeval sk_tv; sk_tv.tv_sec = tv.tv_sec; sk_tv.tv_usec = tv.tv_usec; ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(sk_tv), &sk_tv); } if (ret) goto out; } if (rs->rs_rx_traces) { struct rds_cmsg_rx_trace t; int i, j; memset(&t, 0, sizeof(t)); inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); t.rx_traces = rs->rs_rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { j = rs->rs_rx_trace[i]; t.rx_trace_pos[i] = j; t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] - inc->i_rx_lat_trace[j]; } ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY, sizeof(t), &t); if (ret) goto out; } out: return ret; } static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) { struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue; struct rds_msg_zcopy_info *info = NULL; struct rds_zcopy_cookies *done; unsigned long flags; if (!msg->msg_control) return false; if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || msg->msg_controllen < CMSG_SPACE(sizeof(*done))) return false; spin_lock_irqsave(&q->lock, flags); if (!list_empty(&q->zcookie_head)) { info = list_entry(q->zcookie_head.next, struct rds_msg_zcopy_info, rs_zcookie_next); list_del(&info->rs_zcookie_next); } spin_unlock_irqrestore(&q->lock, flags); if (!info) return false; done = &info->zcookies; if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), done)) { spin_lock_irqsave(&q->lock, flags); list_add(&info->rs_zcookie_next, &q->zcookie_head); spin_unlock_irqrestore(&q->lock, flags); return false; } kfree(info); return true; } int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ timeo = sock_rcvtimeo(sk, nonblock); rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); if (msg_flags & MSG_OOB) goto out; if (msg_flags & MSG_ERRQUEUE) return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); while (1) { /* If there are pending notifications, do those - and nothing else */ if (!list_empty(&rs->rs_notify_queue)) { ret = rds_notify_queue_get(rs, msg); break; } if (rs->rs_cong_notify) { ret = rds_notify_cong(rs, msg); break; } if (!rds_next_incoming(rs, &inc)) { if (nonblock) { bool reaped = rds_recvmsg_zcookie(rs, msg); ret = reaped ? 0 : -EAGAIN; break; } timeo = wait_event_interruptible_timeout(*sk_sleep(sk), (!list_empty(&rs->rs_notify_queue) || rs->rs_cong_notify || rds_next_incoming(rs, &inc)), timeo); rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, timeo); if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) continue; ret = timeo; if (ret == 0) ret = -ETIMEDOUT; break; } rdsdebug("copying inc %p from %pI6c:%u to user\n", inc, &inc->i_conn->c_faddr, ntohs(inc->i_hdr.h_sport)); ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); if (ret < 0) break; /* * if the message we just copied isn't at the head of the * recv queue then someone else raced us to return it, try * to get the next message. */ if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { rds_inc_put(inc); inc = NULL; rds_stats_inc(s_recv_deliver_raced); iov_iter_revert(&msg->msg_iter, ret); continue; } if (ret < be32_to_cpu(inc->i_hdr.h_len)) { if (msg_flags & MSG_TRUNC) ret = be32_to_cpu(inc->i_hdr.h_len); msg->msg_flags |= MSG_TRUNC; } if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; break; } rds_recvmsg_zcookie(rs, msg); rds_stats_inc(s_recv_delivered); if (msg->msg_name) { if (ipv6_addr_v4mapped(&inc->i_saddr)) { sin->sin_family = AF_INET; sin->sin_port = inc->i_hdr.h_sport; sin->sin_addr.s_addr = inc->i_saddr.s6_addr32[3]; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); msg->msg_namelen = sizeof(*sin); } else { sin6->sin6_family = AF_INET6; sin6->sin6_port = inc->i_hdr.h_sport; sin6->sin6_addr = inc->i_saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = rs->rs_bound_scope_id; msg->msg_namelen = sizeof(*sin6); } } break; } if (inc) rds_inc_put(inc); out: return ret; } /* * The socket is being shut down and we're asked to drop messages that were * queued for recvmsg. The caller has unbound the socket so the receive path * won't queue any more incoming fragments or messages on the socket. */ void rds_clear_recv_queue(struct rds_sock *rs) { struct sock *sk = rds_rs_to_sk(rs); struct rds_incoming *inc, *tmp; unsigned long flags; LIST_HEAD(to_drop); write_lock_irqsave(&rs->rs_recv_lock, flags); list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); list_move(&inc->i_item, &to_drop); } write_unlock_irqrestore(&rs->rs_recv_lock, flags); list_for_each_entry_safe(inc, tmp, &to_drop, i_item) { list_del_init(&inc->i_item); rds_inc_put(inc); } } /* * inc->i_saddr isn't used here because it is only set in the receive * path. */ void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip) { struct rds_info_message minfo; minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo.len = be32_to_cpu(inc->i_hdr.h_len); minfo.tos = inc->i_conn->c_tos; if (flip) { minfo.laddr = daddr; minfo.faddr = saddr; minfo.lport = inc->i_hdr.h_dport; minfo.fport = inc->i_hdr.h_sport; } else { minfo.laddr = saddr; minfo.faddr = daddr; minfo.lport = inc->i_hdr.h_sport; minfo.fport = inc->i_hdr.h_dport; } minfo.flags = 0; rds_info_copy(iter, &minfo, sizeof(minfo)); } #if IS_ENABLED(CONFIG_IPV6) void rds6_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, struct in6_addr *saddr, struct in6_addr *daddr, int flip) { struct rds6_info_message minfo6; minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo6.len = be32_to_cpu(inc->i_hdr.h_len); minfo6.tos = inc->i_conn->c_tos; if (flip) { minfo6.laddr = *daddr; minfo6.faddr = *saddr; minfo6.lport = inc->i_hdr.h_dport; minfo6.fport = inc->i_hdr.h_sport; } else { minfo6.laddr = *saddr; minfo6.faddr = *daddr; minfo6.lport = inc->i_hdr.h_sport; minfo6.fport = inc->i_hdr.h_dport; } minfo6.flags = 0; rds_info_copy(iter, &minfo6, sizeof(minfo6)); } #endif
33 34 1 48 48 10 10 53 25 15 53 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* * net/tipc/addr.c: TIPC address utility routines * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "addr.h" #include "core.h" bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) return true; if (!legacy_format) return false; if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */ return true; return false; } void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); tn->trial_addr = hash128to32(id); pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } void tipc_set_node_addr(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); u8 node_id[NODE_ID_LEN] = {0,}; tn->node_addr = addr; if (!tipc_own_id(net)) { sprintf(node_id, "%x", addr); tipc_set_node_id(net, node_id); } tn->trial_addr = addr; tn->addr_trial_end = jiffies; pr_info("Node number set to %u\n", addr); } char *tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; /* Already a string ? */ for (i = 0; i < NODE_ID_LEN; i++) { c = id[i]; if (c >= '0' && c <= '9') continue; if (c >= 'A' && c <= 'Z') continue; if (c >= 'a' && c <= 'z') continue; if (c == '.') continue; if (c == ':') continue; if (c == '_') continue; if (c == '-') continue; if (c == '@') continue; if (c != 0) break; } if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; return str; } /* Translate to hex string */ for (i = 0; i < NODE_ID_LEN; i++) sprintf(&str[2 * i], "%02x", id[i]); /* Strip off trailing zeroes */ for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; return str; }
166 117 50 163 2 10 156 2 157 157 13 6 6 6 6 6 10 1 9 9 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/in6.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { struct sockaddr_in6 udp6_addr = {}; int err; struct socket *sock = NULL; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->ipv6_v6only) { err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { memset(&udp6_addr, 0, sizeof(udp6_addr)); udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) goto error; udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL_GPL(udp_sock_create6); int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); skb_dst_set(skb, dst); udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; ip6h->daddr = *daddr; ip6h->saddr = *saddr; ip6tunnel_xmit(sk, skb, dev); return 0; } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); /** * udp_tunnel6_dst_lookup - perform route lookup on UDP tunnel * @skb: Packet for which lookup is done * @dev: Tunnel device * @net: Network namespace of tunnel device * @sock: Socket which provides route info * @oif: Index of the output interface * @saddr: Memory to store the src ip address * @key: Tunnel information * @sport: UDP source port * @dport: UDP destination port * @dsfield: The traffic class field * @dst_cache: The dst cache to use for lookup * This function performs a route lookup on a UDP tunnel * * It returns a valid dst pointer and stores src address to be used in * tunnel in param saddr on success, else a pointer encoded error code. */ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, int oif, struct in6_addr *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 dsfield, struct dst_cache *dst_cache) { struct dst_entry *dst = NULL; struct flowi6 fl6; #ifdef CONFIG_DST_CACHE if (dst_cache) { dst = dst_cache_get_ip6(dst_cache, saddr); if (dst) return dst; } #endif memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = IPPROTO_UDP; fl6.flowi6_oif = oif; fl6.daddr = key->u.ipv6.dst; fl6.saddr = key->u.ipv6.src; fl6.fl6_sport = sport; fl6.fl6_dport = dport; fl6.flowlabel = ip6_make_flowinfo(dsfield, key->label); dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6, NULL); if (IS_ERR(dst)) { netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); return ERR_PTR(-ENETUNREACH); } if (dst->dev == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); dst_release(dst); return ERR_PTR(-ELOOP); } #ifdef CONFIG_DST_CACHE if (dst_cache) dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); #endif *saddr = fl6.saddr; return dst; } EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup); MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL");
2 1 1 1 1 5 21 21 21 22 23 23 272 276 6 6 6 6 6 22 17 5 1 6 6 22 6 17 26 4 22 21 2 22 2 22 22 22 23 1 22 21 11 1 7 3 1 6 44 2 44 2 46 46 16 27 30 25 23 3 1 4 25 26 26 26 1 25 26 19 19 3 3 7 7 15 15 30 30 29 30 28 30 30 30 6 6 6 12 5 7 10 10 10 10 10 9 1 10 13 1 2 10 10 10 40 1 39 5 35 2 36 1 34 28 15 13 6 6 13 13 1 1 10 10 10 1 9 5 5 3 3 1 2 2 4 5 5 21 1 12 8 14 4 3 3 2 1 23 1 22 23 2 21 1 21 1 5 2 3 1 2 1 1 1 1 1 1 1 1 1 1 204 1 3 198 196 14 29 169 93 63 3 12 2 1 3 58 5 53 23 1 1 1 1 2 1 1 1 1 13 2 2 1 1 3 21 20 1 1 18 20 93 93 93 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * AF_SMC protocol family socket handler keeping the AF_INET sock address type * applies to SOCK_STREAM sockets only * offers an alternative communication option for TCP-protocol sockets * applicable with RoCE-cards only * * Initial restrictions: * - support for alternate links postponed * * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> * based on prototype from Frank Blaschka */ #define KMSG_COMPONENT "smc" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/socket.h> #include <linux/workqueue.h> #include <linux/in.h> #include <linux/sched/signal.h> #include <linux/if_vlan.h> #include <linux/rcupdate_wait.h> #include <linux/ctype.h> #include <linux/splice.h> #include <net/sock.h> #include <net/tcp.h> #include <net/smc.h> #include <asm/ioctls.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "smc_netns.h" #include "smc.h" #include "smc_clc.h" #include "smc_llc.h" #include "smc_cdc.h" #include "smc_core.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" #include "smc_loopback.h" #include "smc_inet.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server */ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); void *hdr; if (cb_ctx->pos[0]) goto out; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_DUMP_HS_LIMITATION); if (!hdr) return -ENOMEM; if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, sock_net(skb->sk)->smc.limit_smc_hs)) goto err; genlmsg_end(skb, hdr); cb_ctx->pos[0] = 1; out: return skb->len; err: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) { sock_net(skb->sk)->smc.limit_smc_hs = true; return 0; } int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) { sock_net(skb->sk)->smc.limit_smc_hs = false; return 0; } static void smc_set_keepalive(struct sock *sk, int val) { struct smc_sock *smc = smc_sk(sk); smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct smc_sock *smc; struct sock *child; smc = smc_clcsock_user_data(sk); if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > sk->sk_max_ack_backlog) goto drop; if (sk_acceptq_is_full(&smc->sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } /* passthrough to original syn recv sock fct */ child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); /* child must not inherit smc or its ops */ if (child) { rcu_assign_sk_user_data(child, NULL); /* v4-mapped sockets don't inherit parent ops. Don't restore. */ if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops) inet_csk(child)->icsk_af_ops = smc->ori_af_ops; } return child; drop: dst_release(dst); tcp_listendrop(sk); return NULL; } static bool smc_hs_congested(const struct sock *sk) { const struct smc_sock *smc; smc = smc_clcsock_user_data(sk); if (!smc) return true; if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) return true; return false; } struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; head = &h->ht; write_lock_bh(&h->lock); sk_add_node(sk, head); write_unlock_bh(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } void smc_unhash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; write_lock_bh(&h->lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } /* This will be called before user really release sock_lock. So do the * work which we didn't do because of user hold the sock_lock in the * BH context */ void smc_release_cb(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); if (smc->conn.tx_in_release_sock) { smc_tx_pending(&smc->conn); smc->conn.tx_in_release_sock = false; } } struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto); struct proto smc_proto6 = { .name = "SMC6", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto6); static void smc_fback_restore_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = NULL; smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); write_unlock_bh(&clcsk->sk_callback_lock); } static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; smc_fback_restore_callbacks(smc); } } static int __smc_release(struct smc_sock *smc) { struct sock *sk = &smc->sk; int rc = 0; if (!smc->use_fallback) { rc = smc_close_active(smc); smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { if (sk->sk_state != SMC_CLOSED) { if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) sock_put(sk); /* passive closing */ if (sk->sk_state == SMC_LISTEN) { /* wake up clcsock accept */ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); } smc_restore_fallback_changes(smc); } sk->sk_prot->unhash(sk); if (sk->sk_state == SMC_CLOSED) { if (smc->clcsock) { release_sock(sk); smc_clcsock_release(smc); lock_sock(sk); } if (!smc->use_fallback) smc_conn_free(&smc->conn); } return rc; } int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; int old_state, rc = 0; if (!sk) goto out; sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); old_state = sk->sk_state; /* cleanup for a dangling non-blocking connect */ if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); if (cancel_work_sync(&smc->connect_work)) sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again */ lock_sock_nested(sk, SINGLE_DEPTH_NESTING); else lock_sock(sk); if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && !smc->use_fallback) smc_close_active_abort(smc); rc = __smc_release(smc); /* detach socket */ sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; } static void smc_destruct(struct sock *sk) { if (sk->sk_state != SMC_CLOSED) return; if (!sock_flag(sk, SOCK_DEAD)) return; } void smc_sk_init(struct net *net, struct sock *sk, int protocol) { struct smc_sock *smc = smc_sk(sk); sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); smc->limit_smc_hs = net->smc.limit_smc_hs; smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; smc_close_init(smc); } static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, int protocol) { struct proto *prot; struct sock *sk; prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); if (!sk) return NULL; sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ smc_sk_init(net, sk, protocol); return sk; } int smc_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); /* replicate tests from inet_bind(), to be safe wrt. future changes */ rc = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) goto out; rc = -EAFNOSUPPORT; if (addr->sin_family != AF_INET && addr->sin_family != AF_INET6 && addr->sin_family != AF_UNSPEC) goto out; /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ if (addr->sin_family == AF_UNSPEC && addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; lock_sock(sk); /* Check if socket is already active */ rc = -EINVAL; if (sk->sk_state != SMC_INIT || smc->connect_nonblock) goto out_rel; smc->clcsock->sk->sk_reuse = sk->sk_reuse; smc->clcsock->sk->sk_reuseport = sk->sk_reuseport; rc = kernel_bind(smc->clcsock, uaddr, addr_len); out_rel: release_sock(sk); out: return rc; } /* copy only relevant settings and flags of SOL_SOCKET level from smc to * clc socket (since smc is not called for these options from net/core) */ #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ (1UL << SOCK_LINGER) | \ (1UL << SOCK_BROADCAST) | \ (1UL << SOCK_TIMESTAMP) | \ (1UL << SOCK_DBG) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVTSTAMPNS) | \ (1UL << SOCK_LOCALROUTE) | \ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ (1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_WIFI_STATUS) | \ (1UL << SOCK_NOFCS) | \ (1UL << SOCK_FILTER_LOCKED) | \ (1UL << SOCK_TSTAMP_NEW)) /* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, unsigned long mask) { nsk->sk_userlocks = osk->sk_userlocks; if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) nsk->sk_sndbuf = osk->sk_sndbuf; if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) nsk->sk_rcvbuf = osk->sk_rcvbuf; } static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, unsigned long mask) { /* options we don't get control via setsockopt for */ nsk->sk_type = osk->sk_type; nsk->sk_sndtimeo = osk->sk_sndtimeo; nsk->sk_rcvtimeo = osk->sk_rcvtimeo; nsk->sk_mark = READ_ONCE(osk->sk_mark); nsk->sk_priority = READ_ONCE(osk->sk_priority); nsk->sk_rcvlowat = osk->sk_rcvlowat; nsk->sk_bound_dev_if = osk->sk_bound_dev_if; nsk->sk_err = osk->sk_err; nsk->sk_flags &= ~mask; nsk->sk_flags |= osk->sk_flags & mask; smc_adjust_sock_bufsizes(nsk, osk, mask); } static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) { smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); } #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ (1UL << SOCK_LINGER) | \ (1UL << SOCK_DBG)) /* copy only settings and flags relevant for smc from clc to smc socket */ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) { smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } /* register the new vzalloced sndbuf on all links */ static int smcr_lgr_reg_sndbufs(struct smc_link *link, struct smc_buf_desc *snd_desc) { struct smc_link_group *lgr = link->lgr; int i, rc = 0; if (!snd_desc->is_vm) return -EINVAL; /* protect against parallel smcr_link_reg_buf() */ down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); if (rc) break; } up_write(&lgr->llc_conf_mutex); return rc; } /* register the new rmb on all links */ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { struct smc_link_group *lgr = link->lgr; bool do_slow = false; int i, rc = 0; rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; if (!rmb_desc->is_reg_mr[link->link_idx]) { up_read(&lgr->llc_conf_mutex); goto slow_path; } } /* mr register already */ goto fast_path; slow_path: do_slow = true; /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; } fast_path: /* exchange confirm_rkey msg with peer */ rc = smc_llc_do_confirm_rkey(link, rmb_desc); if (rc) { rc = -EFAULT; goto out; } rmb_desc->is_conf_rkey = true; out: do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; struct smc_llc_qentry *qentry; int rc; /* Receive CONFIRM LINK request from server over RoCE fabric. * Increasing the client's timeout by twice as much as the server's * timeout by default can temporarily avoid decline messages of * both sides crossing or colliding */ qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); if (rc) return SMC_CLC_DECL_ERR_RDYLNK; smc_wr_remember_qp_attr(link); /* reg the sndbuf if it was vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } /* reg the rmb */ if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); if (link->lgr->max_links > 1) { /* optional 2nd link, receive ADD LINK request from server */ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); if (rc == -EAGAIN) rc = 0; /* no DECLINE received, go with one link */ return rc; } smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); smc_llc_cli_add_link(link, qentry); } return 0; } static bool smc_isascii(char *hostname) { int i; for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) if (!isascii(hostname[i])) return false; return true; } static void smc_conn_save_peer_info_fce(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { struct smc_clc_first_contact_ext *fce; int clc_v2_len; if (clc->hdr.version == SMC_V1 || !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) return; if (smc->conn.lgr->is_smcd) { memcpy(smc->conn.lgr->negotiated_eid, clc->d1.eid, SMC_MAX_EID_LEN); clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, d1); } else { memcpy(smc->conn.lgr->negotiated_eid, clc->r1.eid, SMC_MAX_EID_LEN); clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, r1); } fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len); smc->conn.lgr->peer_os = fce->os_type; smc->conn.lgr->peer_smc_release = fce->release; if (smc_isascii(fce->hostname)) memcpy(smc->conn.lgr->peer_hostname, fce->hostname, SMC_MAX_HOSTNAME_LEN); } static void smcr_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size); smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx; smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token); smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } static void smcd_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; smc->conn.peer_token = ntohll(clc->d0.token); /* msg header takes up space in the buffer */ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; } static void smc_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { if (smc->conn.lgr->is_smcd) smcd_conn_save_peer_info(smc, clc); else smcr_conn_save_peer_info(smc, clc); smc_conn_save_peer_info_fce(smc, clc); } static void smc_link_save_peer_info(struct smc_link *link, struct smc_clc_msg_accept_confirm *clc, struct smc_init_info *ini) { link->peer_qpn = ntoh24(clc->r0.qpn); memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, struct smc_stats_fback *fback_arr) { int cnt; for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { if (fback_arr[cnt].fback_code == smc->fallback_rsn) { fback_arr[cnt].count++; break; } if (!fback_arr[cnt].fback_code) { fback_arr[cnt].fback_code = smc->fallback_rsn; fback_arr[cnt].count++; break; } } } static void smc_stat_fallback(struct smc_sock *smc) { struct net *net = sock_net(&smc->sk); mutex_lock(&net->smc.mutex_fback_rsn); if (smc->listen_smc) { smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); net->smc.fback_rsn->srv_fback_cnt++; } else { smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); net->smc.fback_rsn->clnt_fback_cnt++; } mutex_unlock(&net->smc.mutex_fback_rsn); } /* must be called under rcu read lock */ static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) { struct socket_wq *wq; __poll_t flags; wq = rcu_dereference(smc->sk.sk_wq); if (!skwq_has_sleeper(wq)) return; /* wake up smc sk->sk_wq */ if (!key) { /* sk_state_change */ wake_up_interruptible_all(&wq->wait); } else { flags = key_to_poll(key); if (flags & (EPOLLIN | EPOLLOUT)) /* sk_data_ready or sk_write_space */ wake_up_interruptible_sync_poll(&wq->wait, flags); else if (flags & EPOLLERR) /* sk_error_report */ wake_up_interruptible_poll(&wq->wait, flags); } } static int smc_fback_mark_woken(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct smc_mark_woken *mark = container_of(wait, struct smc_mark_woken, wait_entry); mark->woken = true; mark->key = key; return 0; } static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, void (*clcsock_callback)(struct sock *sk)) { struct smc_mark_woken mark = { .woken = false }; struct socket_wq *wq; init_waitqueue_func_entry(&mark.wait_entry, smc_fback_mark_woken); rcu_read_lock(); wq = rcu_dereference(clcsk->sk_wq); if (!wq) goto out; add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); clcsock_callback(clcsk); remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); if (mark.woken) smc_fback_wakeup_waitqueue(smc, mark.key); out: rcu_read_unlock(); } static void smc_fback_state_change(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_data_ready(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_write_space(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_error_report(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_replace_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, &smc->clcsk_state_change); smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, &smc->clcsk_data_ready); smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, &smc->clcsk_write_space); smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, &smc->clcsk_error_report); write_unlock_bh(&clcsk->sk_callback_lock); } static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { int rc = 0; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { rc = -EBADF; goto out; } smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); trace_smc_switch_to_fallback(smc, reason_code); if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; smc->sk.sk_socket->wq.fasync_list = NULL; /* There might be some wait entries remaining * in smc sk->sk_wq and they should be woken up * as clcsock's wait queue is woken up. */ smc_fback_replace_callbacks(smc); } out: mutex_unlock(&smc->clcsock_release_lock); return rc; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { struct net *net = sock_net(&smc->sk); int rc = 0; rc = smc_switch_to_fallback(smc, reason_code); if (rc) { /* fallback fails */ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; } smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; } /* decline and fall back during connect */ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, u8 version) { struct net *net = sock_net(&smc->sk); int rc; if (reason_code < 0) { /* error, fallback is not possible */ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; } if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; } } return smc_connect_fallback(smc, reason_code); } static void smc_conn_abort(struct smc_sock *smc, int local_first) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; bool lgr_valid = false; if (smc_conn_lgr_valid(conn)) lgr_valid = true; smc_conn_free(conn); if (local_first && lgr_valid) smc_lgr_cleanup_early(lgr); } /* check if there is a rdma device available for this connection. */ /* called for connect and listen */ static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) { /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ smc_pnet_find_roce_resource(smc->clcsock->sk, ini); if (!ini->check_smcrv2 && !ini->ib_dev) return SMC_CLC_DECL_NOSMCRDEV; if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2) return SMC_CLC_DECL_NOSMCRDEV; return 0; } /* check if there is an ISM device available for this connection. */ /* called for connect and listen */ static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ smc_pnet_find_ism_resource(smc->clcsock->sk, ini); if (!ini->ism_dev[0]) return SMC_CLC_DECL_NOSMCDDEV; else ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]); return 0; } /* is chid unique for the ism devices that are already determined? */ static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini, int cnt) { int i = (!ini->ism_dev[0]) ? 1 : 0; for (; i < cnt; i++) if (ini->ism_chid[i] == chid) return false; return true; } /* determine possible V2 ISM devices (either without PNETID or with PNETID plus * PNETID matching net_device) */ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, struct smc_init_info *ini) { int rc = SMC_CLC_DECL_NOSMCDDEV; struct smcd_dev *smcd; int i = 1, entry = 1; bool is_emulated; u16 chid; if (smcd_indicated(ini->smc_type_v1)) rc = 0; /* already initialized for V1 */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->going_away || smcd == ini->ism_dev[0]) continue; chid = smc_ism_get_chid(smcd); if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) continue; is_emulated = __smc_ism_is_emulated(chid); if (!smc_pnet_is_pnetid_set(smcd->pnetid) || smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { if (is_emulated && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) /* It's the last GID-CHID entry left in CLC * Proposal SMC-Dv2 extension, but an Emulated- * ISM device will take two entries. So give * up it and try the next potential ISM device. */ continue; ini->ism_dev[i] = smcd; ini->ism_chid[i] = chid; ini->is_smcd = true; rc = 0; i++; entry = is_emulated ? entry + 2 : entry + 1; if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES) break; } } mutex_unlock(&smcd_dev_list.mutex); ini->ism_offered_cnt = i - 1; if (!ini->ism_dev[0] && !ini->ism_dev[1]) ini->smcd_version = 0; return rc; } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, struct smc_init_info *ini) { if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_ISMVLANERR; return 0; } static int smc_find_proposal_devices(struct smc_sock *smc, struct smc_init_info *ini) { int rc = 0; /* check if there is an ism device available */ if (!(ini->smcd_version & SMC_V1) || smc_find_ism_device(smc, ini) || smc_connect_ism_vlan_setup(smc, ini)) ini->smcd_version &= ~SMC_V1; /* else ISM V1 is supported for this connection */ /* check if there is an rdma device available */ if (!(ini->smcr_version & SMC_V1) || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V1; /* else RDMA is supported for this connection */ ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1, ini->smcr_version & SMC_V1); /* check if there is an ism v2 device available */ if (!(ini->smcd_version & SMC_V2) || !smc_ism_is_v2_capable() || smc_find_ism_v2_device_clnt(smc, ini)) ini->smcd_version &= ~SMC_V2; /* check if there is an rdma v2 device available */ ini->check_smcrv2 = true; ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; if (!(ini->smcr_version & SMC_V2) || #if IS_ENABLED(CONFIG_IPV6) (smc->clcsock->sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&smc->clcsock->sk->sk_v6_rcv_saddr)) || #endif !smc_clc_ueid_count() || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V2; ini->check_smcrv2 = false; ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2, ini->smcr_version & SMC_V2); /* if neither ISM nor RDMA are supported, fallback */ if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) rc = SMC_CLC_DECL_NOSMCDEV; return rc; } /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is * used, the VLAN ID will be registered again during the connection setup. */ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, struct smc_init_info *ini) { if (!smcd_indicated(ini->smc_type_v1)) return 0; if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } #define SMC_CLC_MAX_ACCEPT_LEN \ (sizeof(struct smc_clc_msg_accept_confirm) + \ sizeof(struct smc_clc_first_contact_ext_v2x) + \ sizeof(struct smc_clc_msg_trail)) /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; /* receive SMC Accept CLC message */ return smc_clc_wait_msg(smc, aclc, SMC_CLC_MAX_ACCEPT_LEN, SMC_CLC_ACCEPT, CLC_WAIT_TIME); } void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid) { struct smc_init_info *alt_ini = NULL; memset(gidlist, 0, sizeof(*gidlist)); memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); if (!alt_ini) goto out; alt_ini->vlan_id = lgr->vlan_id; alt_ini->check_smcrv2 = true; alt_ini->smcrv2.saddr = lgr->saddr; smc_pnet_find_alt_roce(lgr, alt_ini, known_dev); if (!alt_ini->smcrv2.ib_dev_v2) goto out; memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); out: kfree(alt_ini); } static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(aclc, false); struct net *net = sock_net(&smc->sk); int rc; if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) return 0; if (fce->v2_direct) { memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); ini->smcrv2.uses_gateway = false; } else { if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr, smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), ini->smcrv2.nexthop_mac, &ini->smcrv2.uses_gateway)) return SMC_CLC_DECL_NOROUTE; if (!ini->smcrv2.uses_gateway) { /* mismatch: peer claims indirect, but its direct */ return SMC_CLC_DECL_NOINDIRECT; } } ini->release_nr = fce->release; rc = smc_clc_clnt_v2x_features_validate(fce, ini); if (rc) return rc; return 0; } /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int i, reason_code = 0; struct smc_link *link; u8 *eid = NULL; ini->is_smcd = false; ini->ib_clcqpn = ntoh24(aclc->r0.qpn); ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); ini->max_conns = SMC_CONN_PER_LGR_MAX; ini->max_links = SMC_LINKS_ADD_LNK_MAX; reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); if (reason_code) return reason_code; mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); if (reason_code) { mutex_unlock(&smc_client_lgr_pending); return reason_code; } smc_conn_save_peer_info(smc, aclc); if (ini->first_contact_local) { link = smc->conn.lnk; } else { /* set link that was assigned by server */ link = NULL; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *l = &smc->conn.lgr->lnk[i]; if (l->peer_qpn == ntoh24(aclc->r0.qpn) && !memcmp(l->peer_gid, &aclc->r0.lcl.gid, SMC_GID_SIZE) && (aclc->hdr.version > SMC_V1 || !memcmp(l->peer_mac, &aclc->r0.lcl.mac, sizeof(l->peer_mac)))) { link = l; break; } } if (!link) { reason_code = SMC_CLC_DECL_NOSRVLINK; goto connect_abort; } smc_switch_link_and_count(&smc->conn, link); } /* create send buffer and rmb */ if (smc_buf_create(smc, false)) { reason_code = SMC_CLC_DECL_MEM; goto connect_abort; } if (ini->first_contact_local) smc_link_save_peer_info(link, aclc, ini); if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto connect_abort; } smc_rx_init(smc); if (ini->first_contact_local) { if (smc_ib_ready_link(link)) { reason_code = SMC_CLC_DECL_ERR_RDYLNK; goto connect_abort; } } else { /* reg sendbufs if they were vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } if (aclc->hdr.version > SMC_V1) { eid = aclc->r1.eid; if (ini->first_contact_local) smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, link->smcibdev, link->gid); } reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, aclc->hdr.version, eid, ini); if (reason_code) goto connect_abort; smc_tx_init(smc); if (ini->first_contact_local) { /* QP confirmation over RoCE fabric */ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_clnt_conf_first_link(smc); smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) goto connect_abort; } mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_client_lgr_pending); smc->connect_nonblock = 0; return reason_code; } /* The server has chosen one of the proposed ISM devices for the communication. * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. */ static int smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } } return -EPROTO; } /* setup for ISM connection of client */ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { u8 *eid = NULL; int rc = 0; ini->is_smcd = true; ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; if (aclc->hdr.version == SMC_V2) { if (ini->first_contact_peer) { struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(aclc, true); ini->release_nr = fce->release; rc = smc_clc_clnt_v2x_features_validate(fce, ini); if (rc) return rc; } rc = smc_v2_determine_accepted_chid(aclc, ini); if (rc) return rc; if (__smc_ism_is_emulated(ini->ism_chid[ini->ism_selected])) ini->ism_peer_gid[ini->ism_selected].gid_ext = ntohll(aclc->d1.gid_ext); /* for non-Emulated-ISM devices, peer gid_ext remains 0. */ } ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid); /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); rc = smc_conn_create(smc, ini); if (rc) { mutex_unlock(&smc_server_lgr_pending); return rc; } /* Create send and receive buffers */ rc = smc_buf_create(smc, true); if (rc) { rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; goto connect_abort; } smc_conn_save_peer_info(smc, aclc); if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) { rc = smcd_buf_attach(smc); if (rc) { rc = SMC_CLC_DECL_MEM; /* try to fallback */ goto connect_abort; } } smc_rx_init(smc); smc_tx_init(smc); if (aclc->hdr.version > SMC_V1) eid = aclc->d1.eid; rc = smc_clc_send_confirm(smc, ini->first_contact_local, aclc->hdr.version, eid, ini); if (rc) goto connect_abort; mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_server_lgr_pending); smc->connect_nonblock = 0; return rc; } /* check if received accept type and version matches a proposed one */ static int smc_connect_check_aclc(struct smc_init_info *ini, struct smc_clc_msg_accept_confirm *aclc) { if (aclc->hdr.version >= SMC_V2) { if ((aclc->hdr.typev1 == SMC_TYPE_R && !smcr_indicated(ini->smc_type_v2)) || (aclc->hdr.typev1 == SMC_TYPE_D && !smcd_indicated(ini->smc_type_v2))) return SMC_CLC_DECL_MODEUNSUPP; } else { if ((aclc->hdr.typev1 == SMC_TYPE_R && !smcr_indicated(ini->smc_type_v1)) || (aclc->hdr.typev1 == SMC_TYPE_D && !smcd_indicated(ini->smc_type_v1))) return SMC_CLC_DECL_MODEUNSUPP; } return 0; } /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; struct smc_clc_msg_accept_confirm *aclc; struct smc_init_info *ini = NULL; u8 *buf = NULL; int rc = 0; if (smc->use_fallback) return smc_connect_fallback(smc, smc->fallback_rsn); /* if peer has not signalled SMC-capability, fall back */ if (!tcp_sk(smc->clcsock->sk)->syn_smc) return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC); /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC, version); ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, version); ini->smcd_version = SMC_V1 | SMC_V2; ini->smcr_version = SMC_V1 | SMC_V2; ini->smc_type_v1 = SMC_TYPE_B; ini->smc_type_v2 = SMC_TYPE_B; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { ini->smcd_version &= ~SMC_V1; ini->smcr_version = 0; ini->smc_type_v1 = SMC_TYPE_N; } rc = smc_find_proposal_devices(smc, ini); if (rc) goto fallback; buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL); if (!buf) { rc = SMC_CLC_DECL_MEM; goto fallback; } aclc = (struct smc_clc_msg_accept_confirm *)buf; /* perform CLC handshake */ rc = smc_connect_clc(smc, aclc, ini); if (rc) { /* -EAGAIN on timeout, see tcp_recvmsg() */ if (rc == -EAGAIN) { rc = -ETIMEDOUT; smc->sk.sk_err = ETIMEDOUT; } goto vlan_cleanup; } /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2; if (rc) goto vlan_cleanup; /* depending on previous steps, connect using rdma or ism */ if (aclc->hdr.typev1 == SMC_TYPE_R) { ini->smcr_version = version; rc = smc_connect_rdma(smc, aclc, ini); } else if (aclc->hdr.typev1 == SMC_TYPE_D) { ini->smcd_version = version; rc = smc_connect_ism(smc, aclc, ini); } if (rc) goto vlan_cleanup; SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); kfree(ini); return 0; vlan_cleanup: smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); fallback: kfree(ini); return smc_connect_decline_fallback(smc, rc, version); } static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); long timeo = smc->sk.sk_sndtimeo; int rc = 0; if (!timeo) timeo = MAX_SCHEDULE_TIMEOUT; lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; } else if ((1 << smc->clcsock->sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); if ((rc == -EPIPE) && ((1 << smc->clcsock->sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) rc = 0; } release_sock(smc->clcsock->sk); lock_sock(&smc->sk); if (rc != 0 || smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; if (rc == -EPIPE || rc == -EAGAIN) smc->sk.sk_err = EPIPE; else if (rc == -ECONNREFUSED) smc->sk.sk_err = ECONNREFUSED; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); sock_put(&smc->sk); /* passive closing */ goto out; } rc = __smc_connect(smc); if (rc < 0) smc->sk.sk_err = -rc; out: if (!sock_flag(&smc->sk, SOCK_DEAD)) { if (smc->sk.sk_err) { smc->sk.sk_state_change(&smc->sk); } else { /* allow polling before and after fallback decision */ smc->clcsock->sk->sk_write_space(smc->clcsock->sk); smc->sk.sk_write_space(&smc->sk); } } release_sock(&smc->sk); } int smc_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -EINVAL; smc = smc_sk(sk); /* separate smc parameter checking to be safe */ if (alen < sizeof(addr->sa_family)) goto out_err; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) goto out_err; lock_sock(sk); switch (sock->state) { default: rc = -EINVAL; goto out; case SS_CONNECTED: rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; goto out; case SS_CONNECTING: if (sk->sk_state == SMC_ACTIVE) goto connected; break; case SS_UNCONNECTED: sock->state = SS_CONNECTING; break; } switch (sk->sk_state) { default: goto out; case SMC_CLOSED: rc = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; goto out; case SMC_ACTIVE: rc = -EISCONN; goto out; case SMC_INIT: break; } smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; if (smc->connect_nonblock) { rc = -EALREADY; goto out; } rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; if (smc->use_fallback) { sock->state = rc ? SS_CONNECTING : SS_CONNECTED; goto out; } sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; goto out; } else { rc = __smc_connect(smc); if (rc < 0) goto out; } connected: rc = 0; sock->state = SS_CONNECTED; out: release_sock(sk); out_err: return rc; } static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) { struct socket *new_clcsock = NULL; struct sock *lsk = &lsmc->sk; struct sock *new_sk; int rc = -EINVAL; release_sock(lsk); new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; *new_smc = NULL; lock_sock(lsk); goto out; } *new_smc = smc_sk(new_sk); mutex_lock(&lsmc->clcsock_release_lock); if (lsmc->clcsock) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); lock_sock(lsk); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; smc_sock_set_flag(new_sk, SOCK_DEAD); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; } /* new clcsock has inherited the smc listen-specific sk_data_ready * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; /* if new clcsock has also inherited the fallback-specific callback * functions, switch them back to the original ones. */ if (lsmc->use_fallback) { if (lsmc->clcsk_state_change) new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; if (lsmc->clcsk_write_space) new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; if (lsmc->clcsk_error_report) new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; } (*new_smc)->clcsock = new_clcsock; out: return rc; } /* add a just created sock to the accept queue of the listen sock as * candidate for a following socket accept call from user space */ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) { struct smc_sock *par = smc_sk(parent); sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_added(parent); } /* remove a socket from the accept queue of its parental listening socket */ static void smc_accept_unlink(struct sock *sk) { struct smc_sock *par = smc_sk(sk)->listen_smc; spin_lock(&par->accept_q_lock); list_del_init(&smc_sk(sk)->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); sock_put(sk); /* sock_hold in smc_accept_enqueue */ } /* remove a sock from the accept queue to bind it to a new socket created * for a socket accept call from user space */ struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock) { struct smc_sock *isk, *n; struct sock *new_sk; list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) { new_sk = (struct sock *)isk; smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); if (isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } sock_put(new_sk); /* final */ continue; } if (new_sock) { sock_graft(new_sk, new_sock); new_sock->state = SS_CONNECTED; if (isk->use_fallback) { smc_sk(new_sk)->clcsock->file = new_sock->file; isk->clcsock->file->private_data = isk->clcsock; } } return new_sk; } return NULL; } /* clean up for a created but never accepted sock */ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT); __smc_release(smc); release_sock(sk); sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; struct smc_llc_qentry *qentry; int rc; /* reg the sndbuf if it was vzalloced*/ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } /* reg the rmb */ if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) return SMC_CLC_DECL_RMBE_EC; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); if (link->lgr->max_links > 1) { down_write(&link->lgr->llc_conf_mutex); /* initial contact - try to establish second link */ smc_llc_srv_add_link(link, NULL); up_write(&link->lgr->llc_conf_mutex); } return 0; } /* listen worker: finish */ static void smc_listen_out(struct smc_sock *new_smc) { struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); release_sock(newsmcsk); /* lock in smc_listen_work() */ if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); release_sock(&lsmc->sk); } else { /* no longer listening */ smc_close_non_accepted(newsmcsk); } /* Wake up accept */ lsmc->sk.sk_data_ready(&lsmc->sk); sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ } /* listen worker: finish in state connected */ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; smc_listen_out(new_smc); } /* listen worker: finish in error state */ static void smc_listen_out_err(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; struct net *net = sock_net(newsmcsk); this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; smc_listen_out(new_smc); } /* listen worker: decline and fall back if possible */ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, int local_first, u8 version) { /* RDMA setup failed, switch back to TCP */ smc_conn_abort(new_smc, local_first); if (reason_code < 0 || smc_switch_to_fallback(new_smc, reason_code)) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); return; } } smc_listen_out_connected(new_smc); } /* listen worker: version checking */ static int smc_listen_v2_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext; struct smc_clc_v2_extension *pclc_v2_ext; int rc = SMC_CLC_DECL_PEERNOSMC; ini->smc_type_v1 = pclc->hdr.typev1; ini->smc_type_v2 = pclc->hdr.typev2; ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0; ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0; if (pclc->hdr.version > SMC_V1) { if (smcd_indicated(ini->smc_type_v2)) ini->smcd_version |= SMC_V2; if (smcr_indicated(ini->smc_type_v2)) ini->smcr_version |= SMC_V2; } if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) { rc = SMC_CLC_DECL_PEERNOSMC; goto out; } pclc_v2_ext = smc_get_clc_v2_ext(pclc); if (!pclc_v2_ext) { ini->smcd_version &= ~SMC_V2; ini->smcr_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOV2EXT; goto out; } pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); if (ini->smcd_version & SMC_V2) { if (!smc_ism_is_v2_capable()) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOISM2SUPP; } else if (!pclc_smcd_v2_ext) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOV2DEXT; } else if (!pclc_v2_ext->hdr.eid_cnt && !pclc_v2_ext->hdr.flag.seid) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOUEID; } } if (ini->smcr_version & SMC_V2) { if (!pclc_v2_ext->hdr.eid_cnt) { ini->smcr_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOUEID; } } ini->release_nr = pclc_v2_ext->hdr.flag.release; if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE) ini->release_nr = SMC_RELEASE; out: if (!ini->smcd_version && !ini->smcr_version) return rc; return 0; } /* listen worker: check prefixes */ static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; struct socket *newclcsock = new_smc->clcsock; if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (!pclc_prfx) return -EPROTO; if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; return 0; } /* listen worker: initialize connection and buffers */ static int smc_listen_rdma_init(struct smc_sock *new_smc, struct smc_init_info *ini) { int rc; /* allocate connection / link group */ rc = smc_conn_create(new_smc, ini); if (rc) return rc; /* create send buffer and rmb */ if (smc_buf_create(new_smc, false)) { smc_conn_abort(new_smc, ini->first_contact_local); return SMC_CLC_DECL_MEM; } return 0; } /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_init_info *ini) { int rc; rc = smc_conn_create(new_smc, ini); if (rc) return rc; /* Create send and receive buffers */ rc = smc_buf_create(new_smc, true); if (rc) { smc_conn_abort(new_smc, ini->first_contact_local); return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; } return 0; } static bool smc_is_already_selected(struct smcd_dev *smcd, struct smc_init_info *ini, int matches) { int i; for (i = 0; i < matches; i++) if (smcd == ini->ism_dev[i]) return true; return false; } /* check for ISM devices matching proposed ISM devices */ static void smc_check_ism_v2_match(struct smc_init_info *ini, u16 proposed_chid, struct smcd_gid *proposed_gid, unsigned int *matches) { struct smcd_dev *smcd; list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->going_away) continue; if (smc_is_already_selected(smcd, ini, *matches)) continue; if (smc_ism_get_chid(smcd) == proposed_chid && !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { ini->ism_peer_gid[*matches].gid = proposed_gid->gid; if (__smc_ism_is_emulated(proposed_chid)) ini->ism_peer_gid[*matches].gid_ext = proposed_gid->gid_ext; /* non-Emulated-ISM's peer gid_ext remains 0. */ ini->ism_dev[*matches] = smcd; (*matches)++; break; } } } static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) { if (!ini->rc) ini->rc = rc; } static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_smcd_v2_extension *smcd_v2_ext; struct smc_clc_v2_extension *smc_v2_ext; struct smc_clc_msg_smcd *pclc_smcd; unsigned int matches = 0; struct smcd_gid smcd_gid; u8 smcd_version; u8 *eid = NULL; int i, rc; u16 chid; if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) goto not_found; pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); if (!pclc_smcd || !smc_v2_ext || !smcd_v2_ext) goto not_found; mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) { /* check for ISM device matching proposed native ISM device */ smcd_gid.gid = ntohll(pclc_smcd->ism.gid); smcd_gid.gid_ext = 0; smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), &smcd_gid, &matches); } for (i = 0; i < smc_v2_ext->hdr.ism_gid_cnt; i++) { /* check for ISM devices matching proposed non-native ISM * devices */ smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid); smcd_gid.gid_ext = 0; chid = ntohs(smcd_v2_ext->gidchid[i].chid); if (__smc_ism_is_emulated(chid)) { if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt || chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid)) /* each Emulated-ISM device takes two GID-CHID * entries and CHID of the second entry repeats * that of the first entry. * * So check if the next GID-CHID entry exists * and both two entries' CHIDs are the same. */ continue; smcd_gid.gid_ext = ntohll(smcd_v2_ext->gidchid[++i].gid); } smc_check_ism_v2_match(ini, chid, &smcd_gid, &matches); } mutex_unlock(&smcd_dev_list.mutex); if (!ini->ism_dev[0]) { smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); goto not_found; } smc_ism_get_system_eid(&eid); if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, smcd_v2_ext->system_eid, eid)) goto not_found; /* separate - outside the smcd_dev_list.lock */ smcd_version = ini->smcd_version; for (i = 0; i < matches; i++) { ini->smcd_version = SMC_V2; ini->is_smcd = true; ini->ism_selected = i; rc = smc_listen_ism_init(new_smc, ini); if (rc) { smc_find_ism_store_rc(rc, ini); /* try next active ISM device */ continue; } return; /* matching and usable V2 ISM device found */ } /* no V2 ISM device could be initialized */ ini->smcd_version = smcd_version; /* restore original value */ ini->negotiated_eid[0] = 0; not_found: ini->smcd_version &= ~SMC_V2; ini->ism_dev[0] = NULL; ini->is_smcd = false; } static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); int rc = 0; /* check if ISM V1 is available */ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1) || !pclc_smcd) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); ini->ism_peer_gid[0].gid_ext = 0; rc = smc_find_ism_device(new_smc, ini); if (rc) goto not_found; ini->ism_selected = 0; rc = smc_listen_ism_init(new_smc, ini); if (!rc) return; /* V1 ISM device found */ not_found: smc_find_ism_store_rc(rc, ini); ini->smcd_version &= ~SMC_V1; ini->ism_dev[0] = NULL; ini->is_smcd = false; } /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) { struct smc_connection *conn = &new_smc->conn; if (!local_first) { /* reg sendbufs if they were vzalloced */ if (conn->sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(conn->lnk, conn->sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; } return 0; } static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_v2_extension *smc_v2_ext; u8 smcr_version; int rc; if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); if (!smc_v2_ext || !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE); memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); ini->check_smcrv2 = true; ini->smcrv2.clc_sk = new_smc->clcsock->sk; ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); rc = smc_find_rdma_device(new_smc, ini); if (rc) { smc_find_ism_store_rc(rc, ini); goto not_found; } if (!ini->smcrv2.uses_gateway) memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); smcr_version = ini->smcr_version; ini->smcr_version = SMC_V2; rc = smc_listen_rdma_init(new_smc, ini); if (!rc) { rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); if (rc) smc_conn_abort(new_smc, ini->first_contact_local); } if (!rc) return; ini->smcr_version = smcr_version; smc_find_ism_store_rc(rc, ini); not_found: ini->smcr_version &= ~SMC_V2; ini->smcrv2.ib_dev_v2 = NULL; ini->check_smcrv2 = false; } static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { int rc; if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1)) return SMC_CLC_DECL_NOSMCDEV; /* prepare RDMA check */ memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); rc = smc_find_rdma_device(new_smc, ini); if (rc) { /* no RDMA device found */ return SMC_CLC_DECL_NOSMCDEV; } rc = smc_listen_rdma_init(new_smc, ini); if (rc) return rc; return smc_listen_rdma_reg(new_smc, ini->first_contact_local); } /* determine the local device matching to proposal */ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { int prfx_rc; /* check for ISM device matching V2 proposed device */ smc_find_ism_v2_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; /* check for matching IP prefix and subnet length (V1) */ prfx_rc = smc_listen_prfx_check(new_smc, pclc); if (prfx_rc) smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) return ini->rc ?: SMC_CLC_DECL_GETVLANERR; /* check for ISM device matching V1 proposed device */ if (!prfx_rc) smc_find_ism_v1_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; if (!smcr_indicated(pclc->hdr.typev1) && !smcr_indicated(pclc->hdr.typev2)) /* skip RDMA and decline */ return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; /* check if RDMA V2 is available */ smc_find_rdma_v2_device_serv(new_smc, pclc, ini); if (ini->smcrv2.ib_dev_v2) return 0; /* check if RDMA V1 is available */ if (!prfx_rc) { int rc; rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); smc_find_ism_store_rc(rc, ini); return (!rc) ? 0 : ini->rc; } return prfx_rc; } /* listen worker: finish RDMA setup */ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, bool local_first, struct smc_init_info *ini) { struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_first) smc_link_save_peer_info(link, cclc, ini); if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) return SMC_CLC_DECL_ERR_RTOK; if (local_first) { if (smc_ib_ready_link(link)) return SMC_CLC_DECL_ERR_RDYLNK; /* QP confirmation over RoCE fabric */ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_serv_conf_first_link(new_smc); smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); } return reason_code; } /* setup for connection of server */ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm *cclc; struct smc_clc_msg_proposal_area *buf; struct smc_clc_msg_proposal *pclc; struct smc_init_info *ini = NULL; u8 proposal_version = SMC_V1; u8 accept_version; int rc = 0; lock_sock(&new_smc->sk); /* release in smc_listen_out() */ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) return smc_listen_out_err(new_smc); if (new_smc->use_fallback) { smc_listen_out_connected(new_smc); return; } /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); if (rc) smc_listen_out_err(new_smc); else smc_listen_out_connected(new_smc); return; } /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) { rc = SMC_CLC_DECL_MEM; goto out_decl; } pclc = (struct smc_clc_msg_proposal *)buf; rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf), SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (rc) goto out_decl; if (pclc->hdr.version > SMC_V1) proposal_version = SMC_V2; /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(new_smc)) { rc = SMC_CLC_DECL_IPSEC; goto out_decl; } ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) { rc = SMC_CLC_DECL_MEM; goto out_decl; } /* initial version checking */ rc = smc_listen_v2_check(new_smc, pclc, ini); if (rc) goto out_decl; rc = smc_clc_srv_v2x_features_validate(new_smc, pclc, ini); if (rc) goto out_decl; mutex_lock(&smc_server_lgr_pending); smc_rx_init(new_smc); smc_tx_init(new_smc); /* determine ISM or RoCE device used for connection */ rc = smc_listen_find_device(new_smc, pclc, ini); if (rc) goto out_unlock; /* send SMC Accept CLC message */ accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, accept_version, ini->negotiated_eid, ini); if (rc) goto out_unlock; /* SMC-D does not need this lock any more */ if (ini->is_smcd) mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ memset(buf, 0, sizeof(*buf)); cclc = (struct smc_clc_msg_accept_confirm *)buf; rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf), SMC_CLC_CONFIRM, CLC_WAIT_TIME); if (rc) { if (!ini->is_smcd) goto out_unlock; goto out_decl; } rc = smc_clc_v2x_features_confirm_check(cclc, ini); if (rc) { if (!ini->is_smcd) goto out_unlock; goto out_decl; } /* fce smc release version is needed in smc_listen_rdma_finish, * so save fce info here. */ smc_conn_save_peer_info_fce(new_smc, cclc); /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, ini->first_contact_local, ini); if (rc) goto out_unlock; mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, cclc); if (ini->is_smcd && smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) { rc = smcd_buf_attach(new_smc); if (rc) goto out_decl; } smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, proposal_version); out_free: kfree(ini); kfree(buf); } static void smc_tcp_listen_work(struct work_struct *work) { struct smc_sock *lsmc = container_of(work, struct smc_sock, tcp_listen_work); struct sock *lsk = &lsmc->sk; struct smc_sock *new_smc; int rc = 0; lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); if (rc) /* clcsock accept queue empty or error */ goto out; if (!new_smc) continue; if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_inc(&lsmc->queued_smc_hs); new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); } out: release_sock(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } static void smc_clcsock_data_ready(struct sock *listen_clcsock) { struct smc_sock *lsmc; read_lock_bh(&listen_clcsock->sk_callback_lock); lsmc = smc_clcsock_user_data(listen_clcsock); if (!lsmc) goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } out: read_unlock_bh(&listen_clcsock->sk_callback_lock); } int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); lock_sock(sk); rc = -EINVAL; if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || smc->connect_nonblock || sock->state != SS_UNCONNECTED) goto out; rc = 0; if (sk->sk_state == SMC_LISTEN) { sk->sk_max_ack_backlog = backlog; goto out; } /* some socket options are handled in core, so we could not apply * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); if (!smc->use_fallback) tcp_sk(smc->clcsock->sk)->syn_smc = 1; /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, smc_clcsock_data_ready, &smc->clcsk_data_ready); write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; smc->af_ops = *smc->ori_af_ops; smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; if (smc->limit_smc_hs) tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; rc = kernel_listen(smc->clcsock, backlog); if (rc) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; out: release_sock(sk); return rc; } int smc_accept(struct socket *sock, struct socket *new_sock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); struct smc_sock *lsmc; long timeo; int rc = 0; lsmc = smc_sk(sk); sock_hold(sk); /* sock_put below */ lock_sock(sk); if (lsmc->sk.sk_state != SMC_LISTEN) { rc = -EINVAL; release_sock(sk); goto out; } /* Wait for an incoming connection */ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { rc = -EAGAIN; break; } release_sock(sk); timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ sched_annotate_sleep(); lock_sock(sk); if (signal_pending(current)) { rc = sock_intr_errno(timeo); break; } } set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); if (!rc) rc = sock_error(nsk); release_sock(sk); if (rc) goto out; if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) { /* wait till data arrives on the socket */ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * MSEC_PER_SEC); if (smc_sk(nsk)->use_fallback) { struct sock *clcsk = smc_sk(nsk)->clcsock->sk; lock_sock(clcsk); if (skb_queue_empty(&clcsk->sk_receive_queue)) sk_wait_data(clcsk, &timeo, NULL); release_sock(clcsk); } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { lock_sock(nsk); smc_rx_wait(smc_sk(nsk), &timeo, 0, smc_rx_data_available); release_sock(nsk); } } out: sock_put(sk); /* sock_hold above */ return rc; } int smc_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct smc_sock *smc; if (peer && (sock->sk->sk_state != SMC_ACTIVE) && (sock->sk->sk_state != SMC_APPCLOSEWAIT1)) return -ENOTCONN; smc = smc_sk(sock->sk); return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); lock_sock(sk); /* SMC does not support connect with fastopen */ if (msg->msg_flags & MSG_FASTOPEN) { /* not connected yet, fallback */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); if (rc) goto out; } else { rc = -EINVAL; goto out; } } else if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_INIT)) { rc = -EPIPE; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); } else { rc = smc_tx_sendmsg(smc, msg, len); SMC_STAT_TX_PAYLOAD(smc, len, rc); } out: release_sock(sk); return rc; } int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -ENOTCONN; smc = smc_sk(sk); lock_sock(sk); if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { /* socket was connected before, no more data to read */ rc = 0; goto out; } if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) || (sk->sk_state == SMC_CLOSED)) goto out; if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { rc = 0; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); } else { msg->msg_namelen = 0; rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); SMC_STAT_RX_PAYLOAD(smc, rc, rc); } out: release_sock(sk); return rc; } static __poll_t smc_accept_poll(struct sock *parent) { struct smc_sock *isk = smc_sk(parent); __poll_t mask = 0; spin_lock(&isk->accept_q_lock); if (!list_empty(&isk->accept_q)) mask = EPOLLIN | EPOLLRDNORM; spin_unlock(&isk->accept_q_lock); return mask; } __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct smc_sock *smc; __poll_t mask = 0; if (!sk) return EPOLLNVAL; smc = smc_sk(sock->sk); if (smc->use_fallback) { /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; } else { if (sk->sk_state != SMC_CLOSED) sock_poll_wait(file, sock, wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) mask |= EPOLLHUP; if (sk->sk_state == SMC_LISTEN) { /* woken up by sk_data_ready in smc_listen_work() */ mask |= smc_accept_poll(sk); } else if (smc->use_fallback) { /* as result of connect_work()*/ mask |= smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; } else { if ((sk->sk_state != SMC_INIT && atomic_read(&smc->conn.sndbuf_space)) || sk->sk_shutdown & SEND_SHUTDOWN) { mask |= EPOLLOUT | EPOLLWRNORM; } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (sk->sk_state != SMC_INIT) { /* Race breaker the same way as tcp_poll(). */ smp_mb__after_atomic(); if (atomic_read(&smc->conn.sndbuf_space)) mask |= EPOLLOUT | EPOLLWRNORM; } } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (sk->sk_state == SMC_APPCLOSEWAIT1) mask |= EPOLLIN; if (smc->conn.urg_state == SMC_URG_VALID) mask |= EPOLLPRI; } } return mask; } int smc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; bool do_shutdown = true; struct smc_sock *smc; int rc = -EINVAL; int old_state; int rc1 = 0; smc = smc_sk(sk); if ((how < SHUT_RD) || (how > SHUT_RDWR)) return rc; lock_sock(sk); if (sock->state == SS_CONNECTING) { if (sk->sk_state == SMC_ACTIVE) sock->state = SS_CONNECTED; else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || sk->sk_state == SMC_PEERCLOSEWAIT2 || sk->sk_state == SMC_APPCLOSEWAIT1 || sk->sk_state == SMC_APPCLOSEWAIT2 || sk->sk_state == SMC_APPFINCLOSEWAIT) sock->state = SS_DISCONNECTING; } rc = -ENOTCONN; if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && (sk->sk_state != SMC_PEERCLOSEWAIT2) && (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_APPCLOSEWAIT2) && (sk->sk_state != SMC_APPFINCLOSEWAIT)) goto out; if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; sk->sk_socket->state = SS_UNCONNECTED; sock_put(sk); } goto out; } switch (how) { case SHUT_RDWR: /* shutdown in both directions */ old_state = sk->sk_state; rc = smc_close_active(smc); if (old_state == SMC_ACTIVE && sk->sk_state == SMC_PEERCLOSEWAIT1) do_shutdown = false; break; case SHUT_WR: rc = smc_close_shutdown_write(smc); break; case SHUT_RD: rc = 0; /* nothing more to do because peer is not involved */ break; } if (do_shutdown && smc->clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; if (sk->sk_state == SMC_CLOSED) sock->state = SS_UNCONNECTED; else sock->state = SS_DISCONNECTING; out: release_sock(sk); return rc ? rc : rc1; } static int __smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; int val, len; smc = smc_sk(sock->sk); if (get_user(len, optlen)) return -EFAULT; len = min_t(int, len, sizeof(int)); if (len < 0) return -EINVAL; switch (optname) { case SMC_LIMIT_HS: val = smc->limit_smc_hs; break; default: return -EOPNOTSUPP; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int __smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; int val, rc; smc = smc_sk(sk); lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: if (optlen < sizeof(int)) { rc = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(int))) { rc = -EFAULT; break; } smc->limit_smc_hs = !!val; rc = 0; break; default: rc = -EOPNOTSUPP; break; } release_sock(sk); return rc; } int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; int val, rc; if (level == SOL_TCP && optname == TCP_ULP) return -EOPNOTSUPP; else if (level == SOL_SMC) return __smc_setsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sk); /* generic setsockopts reaching us here always apply to the * CLC socket */ mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; sk_error_report(sk); } mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); if (rc || smc->use_fallback) goto out; switch (optname) { case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } break; case TCP_NODELAY: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (val) { SMC_STAT_INC(smc, ndly_cnt); smc_tx_pending(&smc->conn); cancel_delayed_work(&smc->conn.tx_work); } } break; case TCP_CORK: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (!val) { SMC_STAT_INC(smc, cork_cnt); smc_tx_pending(&smc->conn); cancel_delayed_work(&smc->conn.tx_work); } } break; case TCP_DEFER_ACCEPT: smc->sockopt_defer_accept = val; break; default: break; } out: release_sock(sk); return rc; } int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; int rc; if (level == SOL_SMC) return __smc_getsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sock->sk); mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) { mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; } rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); mutex_unlock(&smc->clcsock_release_lock); return rc; } int smc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { union smc_host_cursor cons, urg; struct smc_connection *conn; struct smc_sock *smc; int answ; smc = smc_sk(sock->sk); conn = &smc->conn; lock_sock(&smc->sk); if (smc->use_fallback) { if (!smc->clcsock) { release_sock(&smc->sk); return -EBADF; } answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); release_sock(&smc->sk); return answ; } switch (cmd) { case SIOCINQ: /* same as FIONREAD */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = atomic_read(&smc->conn.bytes_to_rcv); break; case SIOCOUTQ: /* output queue size (not send + not acked) */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = smc->conn.sndbuf_desc->len - atomic_read(&smc->conn.sndbuf_space); break; case SIOCOUTQNSD: /* output queue size (not send only) */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = smc_tx_prepared_sends(&smc->conn); break; case SIOCATMARK: if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) { answ = 0; } else { smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); smc_curs_copy(&urg, &conn->urg_curs, conn); answ = smc_curs_diff(conn->rmb_desc->len, &cons, &urg) == 1; } break; default: release_sock(&smc->sk); return -ENOIOCTLCMD; } release_sock(&smc->sk); return put_user(answ, (int __user *)arg); } /* Map the affected portions of the rmbe into an spd, note the number of bytes * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor * updates till whenever a respective page has been fully processed. * Note that subsequent recv() calls have to wait till all splice() processing * completed. */ ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -ENOTCONN; smc = smc_sk(sk); lock_sock(sk); if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { /* socket was connected before, no more data to read */ rc = 0; goto out; } if (sk->sk_state == SMC_INIT || sk->sk_state == SMC_LISTEN || sk->sk_state == SMC_CLOSED) goto out; if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { rc = 0; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, pipe, len, flags); } else { if (*ppos) { rc = -ESPIPE; goto out; } if (flags & SPLICE_F_NONBLOCK) flags = MSG_DONTWAIT; else flags = 0; SMC_STAT_INC(smc, splice_cnt); rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: release_sock(sk); return rc; } /* must look like tcp */ static const struct proto_ops smc_sock_ops = { .family = PF_SMC, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; int smc_create_clcsk(struct net *net, struct sock *sk, int family) { struct smc_sock *smc = smc_sk(sk); int rc; rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &smc->clcsock); if (rc) return rc; /* smc_clcsock_release() does not wait smc->clcsock->sk's * destruction; its sk_state might not be TCP_CLOSE after * smc->sk is close()d, and TCP timers can be fired later, * which need net ref. */ sk = smc->clcsock->sk; sk_net_refcnt_upgrade(sk); return 0; } static int __smc_create(struct net *net, struct socket *sock, int protocol, int kern, struct socket *clcsock) { int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; struct smc_sock *smc; struct sock *sk; int rc; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_STREAM) goto out; rc = -EPROTONOSUPPORT; if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6) goto out; rc = -ENOBUFS; sock->ops = &smc_sock_ops; sock->state = SS_UNCONNECTED; sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); rc = 0; if (clcsock) smc->clcsock = clcsock; else rc = smc_create_clcsk(net, sk, family); if (rc) { sk_common_release(sk); sock->sk = NULL; } out: return rc; } static int smc_create(struct net *net, struct socket *sock, int protocol, int kern) { return __smc_create(net, sock, protocol, kern, NULL); } static const struct net_proto_family smc_sock_family_ops = { .family = PF_SMC, .owner = THIS_MODULE, .create = smc_create, }; static int smc_ulp_init(struct sock *sk) { struct socket *tcp = sk->sk_socket; struct net *net = sock_net(sk); struct socket *smcsock; int protocol, ret; /* only TCP can be replaced */ if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) return -ESOCKTNOSUPPORT; /* don't handle wq now */ if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) return -ENOTCONN; if (sk->sk_family == AF_INET) protocol = SMCPROTO_SMC; else protocol = SMCPROTO_SMC6; smcsock = sock_alloc(); if (!smcsock) return -ENFILE; smcsock->type = SOCK_STREAM; __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ ret = __smc_create(net, smcsock, protocol, 1, tcp); if (ret) { sock_release(smcsock); /* module_put() which ops won't be NULL */ return ret; } /* replace tcp socket to smc */ smcsock->file = tcp->file; smcsock->file->private_data = smcsock; smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ tcp->file = NULL; return ret; } static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(newsk); /* don't inherit ulp ops to child when listen */ icsk->icsk_ulp_ops = NULL; } static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { .name = "smc", .owner = THIS_MODULE, .init = smc_ulp_init, .clone = smc_ulp_clone, }; unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { int rc; rc = smc_sysctl_net_init(net); if (rc) return rc; return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } static __net_init int smc_net_stat_init(struct net *net) { return smc_stats_init(net); } static void __net_exit smc_net_stat_exit(struct net *net) { smc_stats_exit(net); } static struct pernet_operations smc_net_ops = { .init = smc_net_init, .exit = smc_net_exit, .id = &smc_net_id, .size = sizeof(struct smc_net), }; static struct pernet_operations smc_net_stat_ops = { .init = smc_net_stat_init, .exit = smc_net_stat_exit, }; static int __init smc_init(void) { int rc; rc = register_pernet_subsys(&smc_net_ops); if (rc) return rc; rc = register_pernet_subsys(&smc_net_stat_ops); if (rc) goto out_pernet_subsys; rc = smc_ism_init(); if (rc) goto out_pernet_subsys_stat; smc_clc_init(); rc = smc_nl_init(); if (rc) goto out_ism; rc = smc_pnet_init(); if (rc) goto out_nl; rc = -ENOMEM; smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); if (!smc_tcp_ls_wq) goto out_pnet; smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) goto out_alloc_tcp_ls_wq; smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); if (!smc_close_wq) goto out_alloc_hs_wq; rc = smc_core_init(); if (rc) { pr_err("%s: smc_core_init fails with %d\n", __func__, rc); goto out_alloc_wqs; } rc = smc_llc_init(); if (rc) { pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); goto out_core; } rc = smc_cdc_init(); if (rc) { pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); goto out_core; } rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); goto out_core; } rc = proto_register(&smc_proto6, 1); if (rc) { pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc); goto out_proto; } rc = sock_register(&smc_sock_family_ops); if (rc) { pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); goto out_sock; } rc = smc_loopback_init(); if (rc) { pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); goto out_ib; } rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); goto out_lo; } rc = smc_inet_init(); if (rc) { pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); goto out_ulp; } static_branch_enable(&tcp_have_smc); return 0; out_ulp: tcp_unregister_ulp(&smc_ulp_ops); out_lo: smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); out_core: smc_core_exit(); out_alloc_wqs: destroy_workqueue(smc_close_wq); out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); out_alloc_tcp_ls_wq: destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); out_nl: smc_nl_exit(); out_ism: smc_clc_exit(); smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: unregister_pernet_subsys(&smc_net_ops); return rc; } static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); smc_inet_exit(); tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); smc_nl_exit(); smc_clc_exit(); unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } module_init(smc_init); module_exit(smc_exit); MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); MODULE_ALIAS_TCP_ULP("smc"); /* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1); #if IS_ENABLED(CONFIG_IPV6) MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 256, 1); #endif /* CONFIG_IPV6 */ MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME);
114 114 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 // SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs attributes of bridge * Linux ethernet bridge * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/times.h> #include <linux/sched/signal.h> #include "br_private.h" /* IMPORTANT: new bridge options must be added with netlink support only * please do not add new sysfs entries */ #define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd))) /* * Common code for storing bridge parameters. */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, int (*set)(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack)) { struct net_bridge *br = to_bridge(d); struct netlink_ext_ack extack = {0}; unsigned long val; int err; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &val); if (err != 0) return err; if (!rtnl_trylock()) return restart_syscall(); err = (*set)(br, val, &extack); if (!err) netdev_state_change(br->dev); if (extack._msg) { if (err) br_err(br, "%s\n", extack._msg); else br_warn(br, "%s\n", extack._msg); } rtnl_unlock(); return err ? err : len; } static ssize_t forward_delay_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } static int set_forward_delay(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_forward_delay(br, val); } static ssize_t forward_delay_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_forward_delay); } static DEVICE_ATTR_RW(forward_delay); static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->hello_time)); } static int set_hello_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_hello_time(br, val); } static ssize_t hello_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hello_time); } static DEVICE_ATTR_RW(hello_time); static ssize_t max_age_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->max_age)); } static int set_max_age(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_max_age(br, val); } static ssize_t max_age_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_max_age); } static DEVICE_ATTR_RW(max_age); static ssize_t ageing_time_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } static int set_ageing_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_ageing_time(br, val); } static ssize_t ageing_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_ageing_time); } static DEVICE_ATTR_RW(ageing_time); static ssize_t stp_state_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->stp_enabled); } static int set_stp_state(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_stp_set_enabled(br, val, extack); } static ssize_t stp_state_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stp_state); } static DEVICE_ATTR_RW(stp_state); static ssize_t group_fwd_mask_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#x\n", br->group_fwd_mask); } static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (val & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = val; return 0; } static ssize_t group_fwd_mask_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_group_fwd_mask); } static DEVICE_ATTR_RW(group_fwd_mask); static ssize_t priority_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } static int set_priority(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_stp_set_bridge_priority(br, (u16) val); return 0; } static ssize_t priority_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_priority); } static DEVICE_ATTR_RW(priority); static ssize_t root_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->designated_root); } static DEVICE_ATTR_RO(root_id); static ssize_t bridge_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->bridge_id); } static DEVICE_ATTR_RO(bridge_id); static ssize_t root_port_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_port); } static DEVICE_ATTR_RO(root_port); static ssize_t root_path_cost_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); } static DEVICE_ATTR_RO(root_path_cost); static ssize_t topology_change_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->topology_change); } static DEVICE_ATTR_RO(topology_change); static ssize_t topology_change_detected_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->topology_change_detected); } static DEVICE_ATTR_RO(topology_change_detected); static ssize_t hello_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); } static DEVICE_ATTR_RO(hello_timer); static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); } static DEVICE_ATTR_RO(tcn_timer); static ssize_t topology_change_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); } static DEVICE_ATTR_RO(topology_change_timer); static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%pM\n", br->group_addr); } static ssize_t group_addr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct net_bridge *br = to_bridge(d); u8 new_addr[6]; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!mac_pton(buf, new_addr)) return -EINVAL; if (!is_link_local_ether_addr(new_addr)) return -EINVAL; if (new_addr[5] == 1 || /* 802.3x Pause address */ new_addr[5] == 2 || /* 802.3ad Slow protocols */ new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); spin_lock_bh(&br->lock); ether_addr_copy(br->group_addr, new_addr); spin_unlock_bh(&br->lock); br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); netdev_state_change(br->dev); rtnl_unlock(); return len; } static DEVICE_ATTR_RW(group_addr); static int set_flush(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_fdb_flush_desc desc = { .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); return 0; } static ssize_t flush_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_flush); } static DEVICE_ATTR_WO(flush); static ssize_t no_linklocal_learn_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack); } static ssize_t no_linklocal_learn_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_no_linklocal_learn); } static DEVICE_ATTR_RW(no_linklocal_learn); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_router(&br->multicast_ctx, val); } static ssize_t multicast_router_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_router); } static DEVICE_ATTR_RW(multicast_router); static ssize_t multicast_snooping_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_toggle); } static DEVICE_ATTR_RW(multicast_snooping); static ssize_t multicast_query_use_ifaddr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); return 0; } static ssize_t multicast_query_use_ifaddr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_use_ifaddr); } static DEVICE_ATTR_RW(multicast_query_use_ifaddr); static ssize_t multicast_querier_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_querier(&br->multicast_ctx, val); } static ssize_t multicast_querier_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_querier); } static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { /* 16 is RHT_ELASTICITY */ NL_SET_ERR_MSG_MOD(extack, "the hash_elasticity option has been deprecated and is always 16"); return 0; } static ssize_t hash_elasticity_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_elasticity); } static DEVICE_ATTR_RW(hash_elasticity); static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->hash_max); } static int set_hash_max(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->hash_max = val; return 0; } static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hash_max); } static DEVICE_ATTR_RW(hash_max); static ssize_t multicast_igmp_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_igmp_version(&br->multicast_ctx, val); } static ssize_t multicast_igmp_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_igmp_version); } static DEVICE_ATTR_RW(multicast_igmp_version); static ssize_t multicast_last_member_count_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_count = val; return 0; } static ssize_t multicast_last_member_count_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_count); } static DEVICE_ATTR_RW(multicast_last_member_count); static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_startup_query_count = val; return 0; } static ssize_t multicast_startup_query_count_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_count); } static DEVICE_ATTR_RW(multicast_startup_query_count); static ssize_t multicast_last_member_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_last_member_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_interval); } static DEVICE_ATTR_RW(multicast_last_member_interval); static ssize_t multicast_membership_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_membership_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_membership_interval); } static DEVICE_ATTR_RW(multicast_membership_interval); static ssize_t multicast_querier_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_querier_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_querier_interval); } static DEVICE_ATTR_RW(multicast_querier_interval); static ssize_t multicast_query_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_query_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_interval); } static DEVICE_ATTR_RW(multicast_query_interval); static ssize_t multicast_query_response_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_query_response_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_response_interval); } static DEVICE_ATTR_RW(multicast_query_response_interval); static ssize_t multicast_startup_query_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_startup_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_startup_query_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_interval); } static DEVICE_ATTR_RW(multicast_startup_query_interval); static ssize_t multicast_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } static int set_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val); return 0; } static ssize_t multicast_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stats_enabled); } static DEVICE_ATTR_RW(multicast_stats_enabled); #if IS_ENABLED(CONFIG_IPV6) static ssize_t multicast_mld_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_mld_version(&br->multicast_ctx, val); } static ssize_t multicast_mld_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_mld_version); } static DEVICE_ATTR_RW(multicast_mld_version); #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); return 0; } static ssize_t nf_call_iptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_iptables); } static DEVICE_ATTR_RW(nf_call_iptables); static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); return 0; } static ssize_t nf_call_ip6tables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_ip6tables); } static DEVICE_ATTR_RW(nf_call_ip6tables); static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); return 0; } static ssize_t nf_call_arptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_arptables); } static DEVICE_ATTR_RW(nf_call_arptables); #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING static ssize_t vlan_filtering_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); } static ssize_t vlan_filtering_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_filter_toggle); } static DEVICE_ATTR_RW(vlan_filtering); static ssize_t vlan_protocol_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto)); } static ssize_t vlan_protocol_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_proto); } static DEVICE_ATTR_RW(vlan_protocol); static ssize_t default_pvid_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->default_pvid); } static ssize_t default_pvid_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); } static DEVICE_ATTR_RW(default_pvid); static ssize_t vlan_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats(br, val); } static ssize_t vlan_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_enabled); } static DEVICE_ATTR_RW(vlan_stats_enabled); static ssize_t vlan_stats_per_port_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats_per_port(br, val); } static ssize_t vlan_stats_per_port_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_per_port); } static DEVICE_ATTR_RW(vlan_stats_per_port); #endif static struct attribute *bridge_attrs[] = { &dev_attr_forward_delay.attr, &dev_attr_hello_time.attr, &dev_attr_max_age.attr, &dev_attr_ageing_time.attr, &dev_attr_stp_state.attr, &dev_attr_group_fwd_mask.attr, &dev_attr_priority.attr, &dev_attr_bridge_id.attr, &dev_attr_root_id.attr, &dev_attr_root_path_cost.attr, &dev_attr_root_port.attr, &dev_attr_topology_change.attr, &dev_attr_topology_change_detected.attr, &dev_attr_hello_timer.attr, &dev_attr_tcn_timer.attr, &dev_attr_topology_change_timer.attr, &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, &dev_attr_flush.attr, &dev_attr_no_linklocal_learn.attr, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, &dev_attr_multicast_querier.attr, &dev_attr_multicast_query_use_ifaddr.attr, &dev_attr_hash_elasticity.attr, &dev_attr_hash_max.attr, &dev_attr_multicast_last_member_count.attr, &dev_attr_multicast_startup_query_count.attr, &dev_attr_multicast_last_member_interval.attr, &dev_attr_multicast_membership_interval.attr, &dev_attr_multicast_querier_interval.attr, &dev_attr_multicast_query_interval.attr, &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, &dev_attr_multicast_stats_enabled.attr, &dev_attr_multicast_igmp_version.attr, #if IS_ENABLED(CONFIG_IPV6) &dev_attr_multicast_mld_version.attr, #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, &dev_attr_vlan_stats_enabled.attr, &dev_attr_vlan_stats_per_port.attr, #endif NULL }; static const struct attribute_group bridge_group = { .name = SYSFS_BRIDGE_ATTR, .attrs = bridge_attrs, }; /* * Export the forwarding information table as a binary file * The records are struct __fdb_entry. * * Returns the number of bytes read. */ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct net_bridge *br = to_bridge(dev); int n; /* must read whole records */ if (off % sizeof(struct __fdb_entry) != 0) return -EINVAL; n = br_fdb_fillbuf(br, buf, count / sizeof(struct __fdb_entry), off / sizeof(struct __fdb_entry)); if (n > 0) n *= sizeof(struct __fdb_entry); return n; } static const struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, .read_new = brforward_read, }; /* * Add entries in sysfs onto the existing network class device * for the bridge. * Adds a attribute group "bridge" containing tuning parameters. * Binary attribute containing the forward table * Sub directory to hold links to interfaces. * * Note: the ifobj exists only to be a subdirectory * to hold links. The ifobj exists in same data structure * as it's parent the bridge so reference counting works. */ int br_sysfs_addbr(struct net_device *dev) { struct kobject *brobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); int err; err = sysfs_create_group(brobj, &bridge_group); if (err) { pr_info("%s: can't create group %s/%s\n", __func__, dev->name, bridge_group.name); goto out1; } err = sysfs_create_bin_file(brobj, &bridge_forward); if (err) { pr_info("%s: can't create attribute file %s/%s\n", __func__, dev->name, bridge_forward.attr.name); goto out2; } br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj); if (!br->ifobj) { pr_info("%s: can't add kobject (directory) %s/%s\n", __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR); err = -ENOMEM; goto out3; } return 0; out3: sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward); out2: sysfs_remove_group(&dev->dev.kobj, &bridge_group); out1: return err; } void br_sysfs_delbr(struct net_device *dev) { struct kobject *kobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); kobject_put(br->ifobj); sysfs_remove_bin_file(kobj, &bridge_forward); sysfs_remove_group(kobj, &bridge_group); }
4 28 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RDS_RDS_H #define _RDS_RDS_H #include <net/sock.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <rdma/rdma_cm.h> #include <linux/mutex.h> #include <linux/rds.h> #include <linux/rhashtable.h> #include <linux/refcount.h> #include <linux/in6.h> #include "info.h" /* * RDS Network protocol version */ #define RDS_PROTOCOL_3_0 0x0300 #define RDS_PROTOCOL_3_1 0x0301 #define RDS_PROTOCOL_4_0 0x0400 #define RDS_PROTOCOL_4_1 0x0401 #define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 #define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) #define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1 /* The following ports, 16385, 18634, 18635, are registered with IANA as * the ports to be used for RDS over TCP and UDP. Currently, only RDS over * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept * to ensure compatibility with older RDS modules. Those ports are defined * in each transport's header file. */ #define RDS_PORT 18634 #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif #ifdef RDS_DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else /* sigh, pr_debug() causes unused variable warnings */ static inline __printf(1, 2) void rdsdebug(char *fmt, ...) { } #endif #define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) /* Used to limit both RDMA and non-RDMA RDS message to 1MB */ #define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20)) #define RDS_CONG_MAP_BYTES (65536 / 8) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) struct rds_cong_map { struct rb_node m_rb_node; struct in6_addr m_addr; wait_queue_head_t m_waitq; struct list_head m_conn_list; unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; }; /* * This is how we will track the connection state: * A connection is always in one of the following * states. Updates to the state are atomic and imply * a memory barrier. */ enum { RDS_CONN_DOWN = 0, RDS_CONN_CONNECTING, RDS_CONN_DISCONNECTING, RDS_CONN_UP, RDS_CONN_RESETTING, RDS_CONN_ERROR, }; /* Bits for c_flags */ #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 #define RDS_RECV_REFILL 3 #define RDS_DESTROY_PENDING 4 /* Max number of multipaths per RDS connection. Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 #define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ (rs)->rs_hash_initval) & ((n) - 1)) #define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) /* Per mpath connection state */ struct rds_conn_path { struct rds_connection *cp_conn; struct rds_message *cp_xmit_rm; unsigned long cp_xmit_sg; unsigned int cp_xmit_hdr_off; unsigned int cp_xmit_data_off; unsigned int cp_xmit_atomic_sent; unsigned int cp_xmit_rdma_sent; unsigned int cp_xmit_data_sent; spinlock_t cp_lock; /* protect msg queues */ u64 cp_next_tx_seq; struct list_head cp_send_queue; struct list_head cp_retrans; u64 cp_next_rx_seq; void *cp_transport_data; atomic_t cp_state; unsigned long cp_send_gen; unsigned long cp_flags; unsigned long cp_reconnect_jiffies; struct delayed_work cp_send_w; struct delayed_work cp_recv_w; struct delayed_work cp_conn_w; struct work_struct cp_down_w; struct mutex cp_cm_lock; /* protect cp_state & cm */ wait_queue_head_t cp_waitq; unsigned int cp_unacked_packets; unsigned int cp_unacked_bytes; unsigned int cp_index; }; /* One rds_connection per RDS address pair */ struct rds_connection { struct hlist_node c_hash_node; struct in6_addr c_laddr; struct in6_addr c_faddr; int c_dev_if; /* ifindex used for this conn */ int c_bound_if; /* ifindex of c_laddr */ unsigned int c_loopback:1, c_isv6:1, c_ping_triggered:1, c_pad_to_32:29; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; /* Protocol version */ unsigned int c_proposed_version; unsigned int c_version; possible_net_t c_net; /* TOS */ u8 c_tos; struct list_head c_map_item; unsigned long c_map_queued; struct rds_conn_path *c_path; wait_queue_head_t c_hs_waitq; /* handshake waitq */ u32 c_my_gen_num; u32 c_peer_gen_num; }; static inline struct net *rds_conn_net(struct rds_connection *conn) { return read_pnet(&conn->c_net); } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { write_pnet(&conn->c_net, net); } #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 #define RDS_MAX_ADV_CREDIT 255 /* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping * probe to exchange control information before establishing a connection. * Currently the control information that is exchanged is the number of * supported paths. If the peer is a legacy (older kernel revision) peer, * it would return a pong message without additional control information * that would then alert the sender that the peer was an older rev. */ #define RDS_FLAG_PROBE_PORT 1 #define RDS_HS_PROBE(sport, dport) \ ((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \ (sport == 0 && dport == RDS_FLAG_PROBE_PORT)) /* * Maximum space available for extension headers. */ #define RDS_HEADER_EXT_SPACE 16 struct rds_header { __be64 h_sequence; __be64 h_ack; __be32 h_len; __be16 h_sport; __be16 h_dport; u8 h_flags; u8 h_credit; u8 h_padding[4]; __sum16 h_csum; u8 h_exthdr[RDS_HEADER_EXT_SPACE]; }; /* * Reserved - indicates end of extensions */ #define RDS_EXTHDR_NONE 0 /* * This extension header is included in the very * first message that is sent on a new connection, * and identifies the protocol level. This will help * rolling updates if a future change requires breaking * the protocol. * NB: This is no longer true for IB, where we do a version * negotiation during the connection setup phase (protocol * version information is included in the RDMA CM private data). */ #define RDS_EXTHDR_VERSION 1 struct rds_ext_header_version { __be32 h_version; }; /* * This extension header is included in the RDS message * chasing an RDMA operation. */ #define RDS_EXTHDR_RDMA 2 struct rds_ext_header_rdma { __be32 h_rdma_rkey; }; /* * This extension header tells the peer about the * destination <R_Key,offset> of the requested RDMA * operation. */ #define RDS_EXTHDR_RDMA_DEST 3 struct rds_ext_header_rdma_dest { __be32 h_rdma_rkey; __be32 h_rdma_offset; }; /* Extension header announcing number of paths. * Implicit length = 2 bytes. */ #define RDS_EXTHDR_NPATHS 5 #define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ #define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) #define RDS_MSG_RX_HDR 0 #define RDS_MSG_RX_START 1 #define RDS_MSG_RX_END 2 #define RDS_MSG_RX_CMSG 3 /* The following values are whitelisted for usercopy */ struct rds_inc_usercopy { rds_rdma_cookie_t rdma_cookie; ktime_t rx_tstamp; }; struct rds_incoming { refcount_t i_refcount; struct list_head i_item; struct rds_connection *i_conn; struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; struct in6_addr i_saddr; struct rds_inc_usercopy i_usercopy; u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; struct rds_mr { struct rb_node r_rb_node; struct kref r_kref; u32 r_key; /* A copy of the creation flags */ unsigned int r_use_once:1; unsigned int r_invalidate:1; unsigned int r_write:1; struct rds_sock *r_sock; /* back pointer to the socket that owns us */ struct rds_transport *r_trans; void *r_trans_private; }; static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) { return r_key | (((u64) offset) << 32); } static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) { return cookie; } static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) { return cookie >> 32; } /* atomic operation types */ #define RDS_ATOMIC_TYPE_CSWP 0 #define RDS_ATOMIC_TYPE_FADD 1 /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty * the message will not be put back on the retransmit list after being sent. * messages that are canceled while being sent rely on this. * * m_inc is used by loopback so that it can pass an incoming message straight * back up into the rx path. It embeds a wire header which is also used by * the send path, which is kind of awkward. * * m_sock_item indicates the message's presence on a socket's send or receive * queue. m_rs will point to that socket. * * m_daddr is used by cancellation to prune messages to a given destination. * * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock * nesting. As paths iterate over messages on a sock, or conn, they must * also lock the conn, or sock, to remove the message from those lists too. * Testing the flag to determine if the message is still on the lists lets * us avoid testing the list_head directly. That means each path can use * the message's list_head to keep it on a local list while juggling locks * without confusing the other path. * * m_ack_seq is an optional field set by transports who need a different * sequence number range to invalidate. They can use this in a callback * that they pass to rds_send_drop_acked() to see if each message has been * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't * had ack_seq set yet. */ #define RDS_MSG_ON_SOCK 1 #define RDS_MSG_ON_CONN 2 #define RDS_MSG_HAS_ACK_SEQ 3 #define RDS_MSG_ACK_REQUIRED 4 #define RDS_MSG_RETRANSMITTED 5 #define RDS_MSG_MAPPED 6 #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 struct rds_znotifier { struct mmpin z_mmp; u32 z_cookie; }; struct rds_msg_zcopy_info { struct list_head rs_zcookie_next; union { struct rds_znotifier znotif; struct rds_zcopy_cookies zcookies; }; }; struct rds_msg_zcopy_queue { struct list_head zcookie_head; spinlock_t lock; /* protects zcookie_head queue */ }; static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) { spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->zcookie_head); } struct rds_iov_vector { struct rds_iovec *iov; int len; }; struct rds_iov_vector_arr { struct rds_iov_vector *vec; int len; int indx; int incr; }; struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; struct list_head m_conn_item; struct rds_incoming m_inc; u64 m_ack_seq; struct in6_addr m_daddr; unsigned long m_flags; /* Never access m_rs without holding m_rs_lock. * Lock nesting is * rm->m_rs_lock * -> rs->rs_lock */ spinlock_t m_rs_lock; wait_queue_head_t m_flush_wait; struct rds_sock *m_rs; /* cookie to send to remote, in rds header */ rds_rdma_cookie_t m_rdma_cookie; unsigned int m_used_sgs; unsigned int m_total_sgs; void *m_final_op; struct { struct rm_atomic_op { int op_type; union { struct { uint64_t compare; uint64_t swap; uint64_t compare_mask; uint64_t swap_mask; } op_m_cswp; struct { uint64_t add; uint64_t nocarry_mask; } op_m_fadd; }; u32 op_rkey; u64 op_remote_addr; unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; struct scatterlist *op_sg; struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; } atomic; struct rm_rdma_op { u32 op_rkey; u64 op_remote_addr; unsigned int op_write:1; unsigned int op_fence:1; unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; unsigned int op_bytes; unsigned int op_nents; unsigned int op_count; struct scatterlist *op_sg; struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; u64 op_odp_addr; struct rds_mr *op_odp_mr; } rdma; struct rm_data_op { unsigned int op_active:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; struct rds_conn_path *m_conn_path; }; /* * The RDS notifier is used (optionally) to tell the application about * completed RDMA operations. Rather than keeping the whole rds message * around on the queue, we allocate a small notifier that is put on the * socket's notifier_list. Notifications are delivered to the application * through control messages. */ struct rds_notifier { struct list_head n_list; uint64_t n_user_token; int n_status; }; /* Available as part of RDS core, so doesn't need to participate * in get_preferred transport etc */ #define RDS_TRANS_LOOP 3 /** * struct rds_transport - transport specific behavioural hooks * * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send * part of a message. The caller serializes on the send_sem so this * doesn't need to be reentrant for a given conn. The header must be * sent before the data payload. .xmit must be prepared to send a * message with no data payload. .xmit should return the number of * bytes that were sent down the connection, including header bytes. * Returning 0 tells the caller that it doesn't need to perform any * additional work now. This is usually the case when the transport has * filled the sending queue for its connection and will handle * triggering the rds thread to continue the send when space becomes * available. Returning -EAGAIN tells the caller to retry the send * immediately. Returning -ENOMEM tells the caller to retry the send at * some point in the future. * * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once * it returns the connection can not call rds_recv_incoming(). * This will only be called once after conn_connect returns * non-zero success and will The caller serializes this with * the send and connecting paths (xmit_* and conn_*). The * transport is responsible for other serialization, including * rds_recv_incoming(). This is called in process context but * should try hard not to block. */ struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; struct module *t_owner; unsigned int t_prefer_loopback:1, t_mp_capable:1; unsigned int t_type; int (*laddr_check)(struct net *net, const struct in6_addr *addr, __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_path_connect)(struct rds_conn_path *cp); void (*conn_path_shutdown)(struct rds_conn_path *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_path_complete)(struct rds_conn_path *cp); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv_path)(struct rds_conn_path *cp); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to); void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, bool isv6); int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6); void (*cm_connect_complete)(struct rds_connection *conn, struct rdma_cm_event *event); unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, unsigned int avail); void (*exit)(void); void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, struct rds_sock *rs, u32 *key_ret, struct rds_connection *conn, u64 start, u64 length, int need_odp); void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); bool (*t_unloading)(struct rds_connection *conn); u8 (*get_tos_map)(u8 tos); }; /* Bind hash table key length. It is the sum of the size of a struct * in6_addr, a scope_id and a port. */ #define RDS_BOUND_KEY_LEN \ (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16)) struct rds_sock { struct sock rs_sk; u64 rs_user_addr; u64 rs_user_bytes; /* * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. */ struct rhash_head rs_bound_node; u8 rs_bound_key[RDS_BOUND_KEY_LEN]; struct sockaddr_in6 rs_bound_sin6; #define rs_bound_addr rs_bound_sin6.sin6_addr #define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] #define rs_bound_port rs_bound_sin6.sin6_port #define rs_bound_scope_id rs_bound_sin6.sin6_scope_id struct in6_addr rs_conn_addr; #define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3] __be16 rs_conn_port; struct rds_transport *rs_transport; /* * rds_sendmsg caches the conn it used the last time around. * This helps avoid costly lookups. */ struct rds_connection *rs_conn; /* flag indicating we were congested or not */ int rs_congested; /* seen congestion (ENOBUFS) when sending? */ int rs_seen_congestion; /* rs_lock protects all these adjacent members before the newline */ spinlock_t rs_lock; struct list_head rs_send_queue; u32 rs_snd_bytes; int rs_rcv_bytes; struct list_head rs_notify_queue; /* currently used for failed RDMAs */ /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask * to decide whether the application should be woken up. * If not set, we use rs_cong_track to find out whether a cong map * update arrived. */ uint64_t rs_cong_mask; uint64_t rs_cong_notify; struct list_head rs_cong_list; unsigned long rs_cong_track; /* * rs_recv_lock protects the receive queue, and is * used to serialize with rds_release. */ rwlock_t rs_recv_lock; struct list_head rs_recv_queue; /* just for stats reporting */ struct list_head rs_item; /* these have their own lock */ spinlock_t rs_rdma_lock; struct rb_root rs_rdma_keys; /* Socket options - in case there will be more */ unsigned char rs_recverr, rs_cong_monitor; u32 rs_hash_initval; /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; struct rds_msg_zcopy_queue rs_zcookie_queue; u8 rs_tos; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) { return container_of(sk, struct rds_sock, rs_sk); } static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) { return &rs->rs_sk; } /* * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value * to account for overhead. We don't account for overhead, we just apply * the number of payload bytes to the specified value. */ static inline int rds_sk_sndbuf(struct rds_sock *rs) { return rds_rs_to_sk(rs)->sk_sndbuf / 2; } static inline int rds_sk_rcvbuf(struct rds_sock *rs) { return rds_rs_to_sk(rs)->sk_rcvbuf / 2; } struct rds_statistics { uint64_t s_conn_reset; uint64_t s_recv_drop_bad_checksum; uint64_t s_recv_drop_old_seq; uint64_t s_recv_drop_no_sock; uint64_t s_recv_drop_dead_sock; uint64_t s_recv_deliver_raced; uint64_t s_recv_delivered; uint64_t s_recv_queued; uint64_t s_recv_immediate_retry; uint64_t s_recv_delayed_retry; uint64_t s_recv_ack_required; uint64_t s_recv_rdma_bytes; uint64_t s_recv_ping; uint64_t s_send_queue_empty; uint64_t s_send_queue_full; uint64_t s_send_lock_contention; uint64_t s_send_lock_queue_raced; uint64_t s_send_immediate_retry; uint64_t s_send_delayed_retry; uint64_t s_send_drop_acked; uint64_t s_send_ack_required; uint64_t s_send_queued; uint64_t s_send_rdma; uint64_t s_send_rdma_bytes; uint64_t s_send_pong; uint64_t s_page_remainder_hit; uint64_t s_page_remainder_miss; uint64_t s_copy_to_user; uint64_t s_copy_from_user; uint64_t s_cong_update_queued; uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; uint64_t s_recv_bytes_added_to_socket; uint64_t s_recv_bytes_removed_from_socket; uint64_t s_send_stuck_rm; }; /* af_rds.c */ void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); static inline void __rds_wake_sk_sleep(struct sock *sk) { wait_queue_head_t *waitq = sk_sleep(sk); if (!sock_flag(sk, SOCK_DEAD) && waitq) wake_up(waitq); } extern wait_queue_head_t rds_poll_waitq; /* bind.c */ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id); int rds_bind_lock_init(void); void rds_bind_lock_destroy(void); /* cong.c */ int rds_cong_get_maps(struct rds_connection *conn); void rds_cong_add_conn(struct rds_connection *conn); void rds_cong_remove_conn(struct rds_connection *conn); void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); void rds_cong_queue_updates(struct rds_cong_map *map); void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); int rds_cong_updated_since(unsigned long *recent); void rds_cong_add_socket(struct rds_sock *); void rds_cong_remove_socket(struct rds_sock *); void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* connection.c */ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if); void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_conn_path_connect_if_down(struct rds_conn_path *cp); void rds_check_all_paths(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len); __printf(2, 3) void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); #define rds_conn_path_error(cp, fmt...) \ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) static inline int rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) { return atomic_cmpxchg(&cp->cp_state, old, new) == old; } static inline int rds_conn_transition(struct rds_connection *conn, int old, int new) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_transition(&conn->c_path[0], old, new); } static inline int rds_conn_path_state(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state); } static inline int rds_conn_state(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_state(&conn->c_path[0]); } static inline int rds_conn_path_up(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_UP; } static inline int rds_conn_path_down(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_DOWN; } static inline int rds_conn_up(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_up(&conn->c_path[0]); } static inline int rds_conn_path_connecting(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING; } static inline int rds_conn_connecting(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_connecting(&conn->c_path[0]); } /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); void rds_message_wait(struct rds_message *rm); void rds_message_unmapped(struct rds_message *rm); void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info); static inline void rds_message_make_checksum(struct rds_header *hdr) { hdr->h_csum = 0; hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); } static inline int rds_message_verify_checksum(const struct rds_header *hdr) { return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; } /* page.c */ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp); void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, struct in6_addr *saddr); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, struct in6_addr *saddr); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp); int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); void rds_clear_recv_queue(struct rds_sock *rs); int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip); void rds6_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, struct in6_addr *saddr, struct in6_addr *daddr, int flip); /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_path_reset(struct rds_conn_path *conn); int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked); void rds_send_ping(struct rds_connection *conn, int cp_index); int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen); int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen); int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen); void rds_rdma_drop_keys(struct rds_sock *rs); int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov); int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec); int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void rds_rdma_free_op(struct rm_rdma_op *ro); void rds_atomic_free_op(struct rm_atomic_op *ao); void rds_rdma_send_complete(struct rds_message *rm, int wc_status); void rds_atomic_send_complete(struct rds_message *rm, int wc_status); int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void __rds_put_mr_final(struct kref *kref); static inline bool rds_destroy_pending(struct rds_connection *conn) { return !check_net(rds_conn_net(conn)) || (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); } enum { ODP_NOT_NEEDED, ODP_ZEROBASED, ODP_VIRTUAL }; /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ per_cpu(which, get_cpu()).member++; \ put_cpu(); \ } while (0) #define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) #define rds_stats_add_which(which, member, count) do { \ per_cpu(which, get_cpu()).member += count; \ put_cpu(); \ } while (0) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr); /* sysctl.c */ int rds_sysctl_init(void); void rds_sysctl_exit(void); extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_default; extern unsigned long rds_sysctl_sndbuf_max; extern unsigned long rds_sysctl_reconnect_min_jiffies; extern unsigned long rds_sysctl_reconnect_max_jiffies; extern unsigned int rds_sysctl_max_unacked_packets; extern unsigned int rds_sysctl_max_unacked_bytes; extern unsigned int rds_sysctl_ping_enable; extern unsigned long rds_sysctl_trace_flags; extern unsigned int rds_sysctl_trace_level; /* threads.c */ int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; void rds_queue_reconnect(struct rds_conn_path *cp); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2); /* transport.c */ void rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(struct net *net, const struct in6_addr *addr, __u32 scope_id); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); struct rds_transport *rds_trans_get(int t_type); #endif
23 35 37 17 17 5 5 63 71 53 52 39 49 60 68 68 58 17 7 11 5 257 260 17 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 /* BlueZ - Bluetooth protocol stack for Linux Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #ifndef __HCI_CORE_H #define __HCI_CORE_H #include <linux/idr.h> #include <linux/leds.h> #include <linux/rculist.h> #include <net/bluetooth/hci.h> #include <net/bluetooth/hci_sync.h> #include <net/bluetooth/hci_sock.h> #include <net/bluetooth/coredump.h> /* HCI priority */ #define HCI_PRIO_MAX 7 /* HCI maximum id value */ #define HCI_MAX_ID 10000 /* HCI Core structures */ struct inquiry_data { bdaddr_t bdaddr; __u8 pscan_rep_mode; __u8 pscan_period_mode; __u8 pscan_mode; __u8 dev_class[3]; __le16 clock_offset; __s8 rssi; __u8 ssp_mode; }; struct inquiry_entry { struct list_head all; /* inq_cache.all */ struct list_head list; /* unknown or resolve */ enum { NAME_NOT_KNOWN, NAME_NEEDED, NAME_PENDING, NAME_KNOWN, } name_state; __u32 timestamp; struct inquiry_data data; }; struct discovery_state { int type; enum { DISCOVERY_STOPPED, DISCOVERY_STARTING, DISCOVERY_FINDING, DISCOVERY_RESOLVING, DISCOVERY_STOPPING, } state; struct list_head all; /* All devices found during inquiry */ struct list_head unknown; /* Name state not known */ struct list_head resolve; /* Name needs to be resolved */ __u32 timestamp; bdaddr_t last_adv_addr; u8 last_adv_addr_type; s8 last_adv_rssi; u32 last_adv_flags; u8 last_adv_data[HCI_MAX_EXT_AD_LENGTH]; u8 last_adv_data_len; bool report_invalid_rssi; bool result_filtering; bool limited; s8 rssi; u16 uuid_count; u8 (*uuids)[16]; unsigned long name_resolve_timeout; }; #define SUSPEND_NOTIFIER_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ enum suspend_tasks { SUSPEND_PAUSE_DISCOVERY, SUSPEND_UNPAUSE_DISCOVERY, SUSPEND_PAUSE_ADVERTISING, SUSPEND_UNPAUSE_ADVERTISING, SUSPEND_SCAN_DISABLE, SUSPEND_SCAN_ENABLE, SUSPEND_DISCONNECTING, SUSPEND_POWERING_DOWN, SUSPEND_PREPARE_NOTIFIER, SUSPEND_SET_ADV_FILTER, __SUSPEND_NUM_TASKS }; enum suspended_state { BT_RUNNING = 0, BT_SUSPEND_DISCONNECT, BT_SUSPEND_CONFIGURE_WAKE, }; struct hci_conn_hash { struct list_head list; unsigned int acl_num; unsigned int sco_num; unsigned int iso_num; unsigned int le_num; unsigned int le_num_peripheral; }; struct bdaddr_list { struct list_head list; bdaddr_t bdaddr; u8 bdaddr_type; }; struct codec_list { struct list_head list; u8 id; __u16 cid; __u16 vid; u8 transport; u8 num_caps; u32 len; struct hci_codec_caps caps[]; }; struct bdaddr_list_with_irk { struct list_head list; bdaddr_t bdaddr; u8 bdaddr_type; u8 peer_irk[16]; u8 local_irk[16]; }; /* Bitmask of connection flags */ enum hci_conn_flags { HCI_CONN_FLAG_REMOTE_WAKEUP = BIT(0), HCI_CONN_FLAG_DEVICE_PRIVACY = BIT(1), HCI_CONN_FLAG_ADDRESS_RESOLUTION = BIT(2), }; typedef u8 hci_conn_flags_t; struct bdaddr_list_with_flags { struct list_head list; bdaddr_t bdaddr; u8 bdaddr_type; hci_conn_flags_t flags; }; struct bt_uuid { struct list_head list; u8 uuid[16]; u8 size; u8 svc_hint; }; struct blocked_key { struct list_head list; struct rcu_head rcu; u8 type; u8 val[16]; }; struct smp_csrk { bdaddr_t bdaddr; u8 bdaddr_type; u8 type; u8 val[16]; }; struct smp_ltk { struct list_head list; struct rcu_head rcu; bdaddr_t bdaddr; u8 bdaddr_type; u8 authenticated; u8 type; u8 enc_size; __le16 ediv; __le64 rand; u8 val[16]; }; struct smp_irk { struct list_head list; struct rcu_head rcu; bdaddr_t rpa; bdaddr_t bdaddr; u8 addr_type; u8 val[16]; }; struct link_key { struct list_head list; struct rcu_head rcu; bdaddr_t bdaddr; u8 type; u8 val[HCI_LINK_KEY_SIZE]; u8 pin_len; }; struct oob_data { struct list_head list; bdaddr_t bdaddr; u8 bdaddr_type; u8 present; u8 hash192[16]; u8 rand192[16]; u8 hash256[16]; u8 rand256[16]; }; struct adv_info { struct list_head list; bool enabled; bool pending; bool periodic; __u8 mesh; __u8 instance; __u8 handle; __u32 flags; __u16 timeout; __u16 remaining_time; __u16 duration; __u16 adv_data_len; __u8 adv_data[HCI_MAX_EXT_AD_LENGTH]; bool adv_data_changed; __u16 scan_rsp_len; __u8 scan_rsp_data[HCI_MAX_EXT_AD_LENGTH]; bool scan_rsp_changed; __u16 per_adv_data_len; __u8 per_adv_data[HCI_MAX_PER_AD_LENGTH]; __s8 tx_power; __u32 min_interval; __u32 max_interval; bdaddr_t random_addr; bool rpa_expired; struct delayed_work rpa_expired_cb; }; struct tx_queue { struct sk_buff_head queue; unsigned int extra; unsigned int tracked; }; #define HCI_MAX_ADV_INSTANCES 5 #define HCI_DEFAULT_ADV_DURATION 2 #define HCI_ADV_TX_POWER_NO_PREFERENCE 0x7F #define DATA_CMP(_d1, _l1, _d2, _l2) \ (_l1 == _l2 ? memcmp(_d1, _d2, _l1) : _l1 - _l2) #define ADV_DATA_CMP(_adv, _data, _len) \ DATA_CMP((_adv)->adv_data, (_adv)->adv_data_len, _data, _len) #define SCAN_RSP_CMP(_adv, _data, _len) \ DATA_CMP((_adv)->scan_rsp_data, (_adv)->scan_rsp_len, _data, _len) struct monitored_device { struct list_head list; bdaddr_t bdaddr; __u8 addr_type; __u16 handle; bool notified; }; struct adv_pattern { struct list_head list; __u8 ad_type; __u8 offset; __u8 length; __u8 value[HCI_MAX_EXT_AD_LENGTH]; }; struct adv_rssi_thresholds { __s8 low_threshold; __s8 high_threshold; __u16 low_threshold_timeout; __u16 high_threshold_timeout; __u8 sampling_period; }; struct adv_monitor { struct list_head patterns; struct adv_rssi_thresholds rssi; __u16 handle; enum { ADV_MONITOR_STATE_NOT_REGISTERED, ADV_MONITOR_STATE_REGISTERED, ADV_MONITOR_STATE_OFFLOADED } state; }; #define HCI_MIN_ADV_MONITOR_HANDLE 1 #define HCI_MAX_ADV_MONITOR_NUM_HANDLES 32 #define HCI_MAX_ADV_MONITOR_NUM_PATTERNS 16 #define HCI_ADV_MONITOR_EXT_NONE 1 #define HCI_ADV_MONITOR_EXT_MSFT 2 #define HCI_MAX_SHORT_NAME_LENGTH 10 #define HCI_CONN_HANDLE_MAX 0x0eff #define HCI_CONN_HANDLE_UNSET(_handle) (_handle > HCI_CONN_HANDLE_MAX) /* Min encryption key size to match with SMP */ #define HCI_MIN_ENC_KEY_SIZE 7 /* Default LE RPA expiry time, 15 minutes */ #define HCI_DEFAULT_RPA_TIMEOUT (15 * 60) /* Default min/max age of connection information (1s/3s) */ #define DEFAULT_CONN_INFO_MIN_AGE 1000 #define DEFAULT_CONN_INFO_MAX_AGE 3000 /* Default authenticated payload timeout 30s */ #define DEFAULT_AUTH_PAYLOAD_TIMEOUT 0x0bb8 #define HCI_MAX_PAGES 3 struct hci_dev { struct list_head list; struct mutex lock; struct ida unset_handle_ida; const char *name; unsigned long flags; __u16 id; __u8 bus; bdaddr_t bdaddr; bdaddr_t setup_addr; bdaddr_t public_addr; bdaddr_t random_addr; bdaddr_t static_addr; __u8 adv_addr_type; __u8 dev_name[HCI_MAX_NAME_LENGTH]; __u8 short_name[HCI_MAX_SHORT_NAME_LENGTH]; __u8 eir[HCI_MAX_EIR_LENGTH]; __u16 appearance; __u8 dev_class[3]; __u8 major_class; __u8 minor_class; __u8 max_page; __u8 features[HCI_MAX_PAGES][8]; __u8 le_features[8]; __u8 le_accept_list_size; __u8 le_resolv_list_size; __u8 le_num_of_adv_sets; __u8 le_states[8]; __u8 mesh_ad_types[16]; __u8 mesh_send_ref; __u8 commands[64]; __u8 hci_ver; __u16 hci_rev; __u8 lmp_ver; __u16 manufacturer; __u16 lmp_subver; __u16 voice_setting; __u8 num_iac; __u16 stored_max_keys; __u16 stored_num_keys; __u8 io_capability; __s8 inq_tx_power; __u8 err_data_reporting; __u16 page_scan_interval; __u16 page_scan_window; __u8 page_scan_type; __u8 le_adv_channel_map; __u16 le_adv_min_interval; __u16 le_adv_max_interval; __u8 le_scan_type; __u16 le_scan_interval; __u16 le_scan_window; __u16 le_scan_int_suspend; __u16 le_scan_window_suspend; __u16 le_scan_int_discovery; __u16 le_scan_window_discovery; __u16 le_scan_int_adv_monitor; __u16 le_scan_window_adv_monitor; __u16 le_scan_int_connect; __u16 le_scan_window_connect; __u16 le_conn_min_interval; __u16 le_conn_max_interval; __u16 le_conn_latency; __u16 le_supv_timeout; __u16 le_def_tx_len; __u16 le_def_tx_time; __u16 le_max_tx_len; __u16 le_max_tx_time; __u16 le_max_rx_len; __u16 le_max_rx_time; __u8 le_max_key_size; __u8 le_min_key_size; __u16 discov_interleaved_timeout; __u16 conn_info_min_age; __u16 conn_info_max_age; __u16 auth_payload_timeout; __u8 min_enc_key_size; __u8 max_enc_key_size; __u8 pairing_opts; __u8 ssp_debug_mode; __u8 hw_error_code; __u32 clock; __u16 advmon_allowlist_duration; __u16 advmon_no_filter_duration; __u8 enable_advmon_interleave_scan; __u16 devid_source; __u16 devid_vendor; __u16 devid_product; __u16 devid_version; __u8 def_page_scan_type; __u16 def_page_scan_int; __u16 def_page_scan_window; __u8 def_inq_scan_type; __u16 def_inq_scan_int; __u16 def_inq_scan_window; __u16 def_br_lsto; __u16 def_page_timeout; __u16 def_multi_adv_rotation_duration; __u16 def_le_autoconnect_timeout; __s8 min_le_tx_power; __s8 max_le_tx_power; __u16 pkt_type; __u16 esco_type; __u16 link_policy; __u16 link_mode; __u32 idle_timeout; __u16 sniff_min_interval; __u16 sniff_max_interval; unsigned int auto_accept_delay; unsigned long quirks; atomic_t cmd_cnt; unsigned int acl_cnt; unsigned int sco_cnt; unsigned int le_cnt; unsigned int iso_cnt; unsigned int acl_mtu; unsigned int sco_mtu; unsigned int le_mtu; unsigned int iso_mtu; unsigned int acl_pkts; unsigned int sco_pkts; unsigned int le_pkts; unsigned int iso_pkts; unsigned long acl_last_tx; unsigned long le_last_tx; __u8 le_tx_def_phys; __u8 le_rx_def_phys; struct workqueue_struct *workqueue; struct workqueue_struct *req_workqueue; struct work_struct power_on; struct delayed_work power_off; struct work_struct error_reset; struct work_struct cmd_sync_work; struct list_head cmd_sync_work_list; struct mutex cmd_sync_work_lock; struct mutex unregister_lock; struct work_struct cmd_sync_cancel_work; struct work_struct reenable_adv_work; __u16 discov_timeout; struct delayed_work discov_off; struct delayed_work service_cache; struct delayed_work cmd_timer; struct delayed_work ncmd_timer; struct work_struct rx_work; struct work_struct cmd_work; struct work_struct tx_work; struct delayed_work le_scan_disable; struct sk_buff_head rx_q; struct sk_buff_head raw_q; struct sk_buff_head cmd_q; struct sk_buff *sent_cmd; struct sk_buff *recv_event; struct mutex req_lock; wait_queue_head_t req_wait_q; __u32 req_status; __u32 req_result; struct sk_buff *req_skb; struct sk_buff *req_rsp; void *smp_data; void *smp_bredr_data; struct discovery_state discovery; bool discovery_paused; int advertising_old_state; bool advertising_paused; struct notifier_block suspend_notifier; enum suspended_state suspend_state_next; enum suspended_state suspend_state; bool scanning_paused; bool suspended; u8 wake_reason; bdaddr_t wake_addr; u8 wake_addr_type; struct hci_conn_hash conn_hash; struct list_head mesh_pending; struct list_head mgmt_pending; struct list_head reject_list; struct list_head accept_list; struct list_head uuids; struct list_head link_keys; struct list_head long_term_keys; struct list_head identity_resolving_keys; struct list_head remote_oob_data; struct list_head le_accept_list; struct list_head le_resolv_list; struct list_head le_conn_params; struct list_head pend_le_conns; struct list_head pend_le_reports; struct list_head blocked_keys; struct list_head local_codecs; struct hci_dev_stats stat; atomic_t promisc; const char *hw_info; const char *fw_info; struct dentry *debugfs; struct hci_devcoredump dump; struct device dev; struct rfkill *rfkill; DECLARE_BITMAP(dev_flags, __HCI_NUM_FLAGS); hci_conn_flags_t conn_flags; __s8 adv_tx_power; __u8 adv_data[HCI_MAX_EXT_AD_LENGTH]; __u8 adv_data_len; __u8 scan_rsp_data[HCI_MAX_EXT_AD_LENGTH]; __u8 scan_rsp_data_len; __u8 per_adv_data[HCI_MAX_PER_AD_LENGTH]; __u8 per_adv_data_len; struct list_head adv_instances; unsigned int adv_instance_cnt; __u8 cur_adv_instance; __u16 adv_instance_timeout; struct delayed_work adv_instance_expire; struct idr adv_monitors_idr; unsigned int adv_monitors_cnt; __u8 irk[16]; __u32 rpa_timeout; struct delayed_work rpa_expired; bdaddr_t rpa; struct delayed_work mesh_send_done; enum { INTERLEAVE_SCAN_NONE, INTERLEAVE_SCAN_NO_FILTER, INTERLEAVE_SCAN_ALLOWLIST } interleave_scan_state; struct delayed_work interleave_scan; struct list_head monitored_devices; bool advmon_pend_notify; #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif #if IS_ENABLED(CONFIG_BT_MSFTEXT) __u16 msft_opcode; void *msft_data; bool msft_curve_validity; #endif #if IS_ENABLED(CONFIG_BT_AOSPEXT) bool aosp_capable; bool aosp_quality_report; #endif int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); int (*flush)(struct hci_dev *hdev); int (*setup)(struct hci_dev *hdev); int (*shutdown)(struct hci_dev *hdev); int (*send)(struct hci_dev *hdev, struct sk_buff *skb); void (*notify)(struct hci_dev *hdev, unsigned int evt); void (*hw_error)(struct hci_dev *hdev, u8 code); int (*post_init)(struct hci_dev *hdev); int (*set_diag)(struct hci_dev *hdev, bool enable); int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); void (*reset)(struct hci_dev *hdev); bool (*wakeup)(struct hci_dev *hdev); int (*set_quality_report)(struct hci_dev *hdev, bool enable); int (*get_data_path_id)(struct hci_dev *hdev, __u8 *data_path); int (*get_codec_config_data)(struct hci_dev *hdev, __u8 type, struct bt_codec *codec, __u8 *vnd_len, __u8 **vnd_data); u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb); }; #define HCI_PHY_HANDLE(handle) (handle & 0xff) enum conn_reasons { CONN_REASON_PAIR_DEVICE, CONN_REASON_L2CAP_CHAN, CONN_REASON_SCO_CONNECT, CONN_REASON_ISO_CONNECT, }; struct hci_conn { struct list_head list; atomic_t refcnt; bdaddr_t dst; __u8 dst_type; bdaddr_t src; __u8 src_type; bdaddr_t init_addr; __u8 init_addr_type; bdaddr_t resp_addr; __u8 resp_addr_type; __u8 adv_instance; __u16 handle; __u16 sync_handle; __u8 sid; __u16 state; __u16 mtu; __u8 mode; __u8 type; __u8 role; bool out; __u8 attempt; __u8 dev_class[3]; __u8 features[HCI_MAX_PAGES][8]; __u16 pkt_type; __u16 link_policy; __u8 key_type; __u8 auth_type; __u8 sec_level; __u8 pending_sec_level; __u8 pin_length; __u8 enc_key_size; __u8 io_capability; __u32 passkey_notify; __u8 passkey_entered; __u16 disc_timeout; __u16 conn_timeout; __u16 setting; __u16 auth_payload_timeout; __u16 le_conn_min_interval; __u16 le_conn_max_interval; __u16 le_conn_interval; __u16 le_conn_latency; __u16 le_supv_timeout; __u8 le_adv_data[HCI_MAX_EXT_AD_LENGTH]; __u8 le_adv_data_len; __u8 le_per_adv_data[HCI_MAX_PER_AD_TOT_LEN]; __u16 le_per_adv_data_len; __u16 le_per_adv_data_offset; __u8 le_adv_phy; __u8 le_adv_sec_phy; __u8 le_tx_phy; __u8 le_rx_phy; __s8 rssi; __s8 tx_power; __s8 max_tx_power; struct bt_iso_qos iso_qos; __u8 num_bis; __u8 bis[HCI_MAX_ISO_BIS]; unsigned long flags; enum conn_reasons conn_reason; __u8 abort_reason; __u32 clock; __u16 clock_accuracy; unsigned long conn_info_timestamp; __u8 remote_cap; __u8 remote_auth; __u8 remote_id; unsigned int sent; struct sk_buff_head data_q; struct list_head chan_list; struct tx_queue tx_q; struct delayed_work disc_work; struct delayed_work auto_accept_work; struct delayed_work idle_work; struct delayed_work le_conn_timeout; struct device dev; struct dentry *debugfs; struct hci_dev *hdev; void *l2cap_data; void *sco_data; void *iso_data; struct list_head link_list; struct hci_conn *parent; struct hci_link *link; struct bt_codec codec; void (*connect_cfm_cb) (struct hci_conn *conn, u8 status); void (*security_cfm_cb) (struct hci_conn *conn, u8 status); void (*disconn_cfm_cb) (struct hci_conn *conn, u8 reason); void (*cleanup)(struct hci_conn *conn); }; struct hci_link { struct list_head list; struct hci_conn *conn; }; struct hci_chan { struct list_head list; __u16 handle; struct hci_conn *conn; struct sk_buff_head data_q; unsigned int sent; __u8 state; }; struct hci_conn_params { struct list_head list; struct list_head action; bdaddr_t addr; u8 addr_type; u16 conn_min_interval; u16 conn_max_interval; u16 conn_latency; u16 supervision_timeout; enum { HCI_AUTO_CONN_DISABLED, HCI_AUTO_CONN_REPORT, HCI_AUTO_CONN_DIRECT, HCI_AUTO_CONN_ALWAYS, HCI_AUTO_CONN_LINK_LOSS, HCI_AUTO_CONN_EXPLICIT, } auto_connect; struct hci_conn *conn; bool explicit_connect; /* Accessed without hdev->lock: */ hci_conn_flags_t flags; u8 privacy_mode; }; extern struct list_head hci_dev_list; extern struct list_head hci_cb_list; extern rwlock_t hci_dev_list_lock; extern struct mutex hci_cb_list_lock; #define hci_dev_set_flag(hdev, nr) set_bit((nr), (hdev)->dev_flags) #define hci_dev_clear_flag(hdev, nr) clear_bit((nr), (hdev)->dev_flags) #define hci_dev_change_flag(hdev, nr) change_bit((nr), (hdev)->dev_flags) #define hci_dev_test_flag(hdev, nr) test_bit((nr), (hdev)->dev_flags) #define hci_dev_test_and_set_flag(hdev, nr) test_and_set_bit((nr), (hdev)->dev_flags) #define hci_dev_test_and_clear_flag(hdev, nr) test_and_clear_bit((nr), (hdev)->dev_flags) #define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags) #define hci_dev_clear_volatile_flags(hdev) \ do { \ hci_dev_clear_flag(hdev, HCI_LE_SCAN); \ hci_dev_clear_flag(hdev, HCI_LE_ADV); \ hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\ hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \ hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); \ } while (0) #define hci_dev_le_state_simultaneous(hdev) \ (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \ (hdev->le_states[4] & 0x08) && /* Central */ \ (hdev->le_states[4] & 0x40) && /* Peripheral */ \ (hdev->le_states[3] & 0x10)) /* Simultaneous */ /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); int l2cap_disconn_ind(struct hci_conn *hcon); void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); #if IS_ENABLED(CONFIG_BT_BREDR) int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb); #else static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { return 0; } static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) { } #endif #if IS_ENABLED(CONFIG_BT_LE) int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); #else static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { return 0; } static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { } #endif /* ----- Inquiry cache ----- */ #define INQUIRY_CACHE_AGE_MAX (HZ*30) /* 30 seconds */ #define INQUIRY_ENTRY_AGE_MAX (HZ*60) /* 60 seconds */ static inline void discovery_init(struct hci_dev *hdev) { hdev->discovery.state = DISCOVERY_STOPPED; INIT_LIST_HEAD(&hdev->discovery.all); INIT_LIST_HEAD(&hdev->discovery.unknown); INIT_LIST_HEAD(&hdev->discovery.resolve); hdev->discovery.report_invalid_rssi = true; hdev->discovery.rssi = HCI_RSSI_INVALID; } static inline void hci_discovery_filter_clear(struct hci_dev *hdev) { hdev->discovery.result_filtering = false; hdev->discovery.report_invalid_rssi = true; hdev->discovery.rssi = HCI_RSSI_INVALID; hdev->discovery.uuid_count = 0; kfree(hdev->discovery.uuids); hdev->discovery.uuids = NULL; } bool hci_discovery_active(struct hci_dev *hdev); void hci_discovery_set_state(struct hci_dev *hdev, int state); static inline int inquiry_cache_empty(struct hci_dev *hdev) { return list_empty(&hdev->discovery.all); } static inline long inquiry_cache_age(struct hci_dev *hdev) { struct discovery_state *c = &hdev->discovery; return jiffies - c->timestamp; } static inline long inquiry_entry_age(struct inquiry_entry *e) { return jiffies - e->timestamp; } struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr); struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev, bdaddr_t *bdaddr); struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev, bdaddr_t *bdaddr, int state); void hci_inquiry_cache_update_resolve(struct hci_dev *hdev, struct inquiry_entry *ie); u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, bool name_known); void hci_inquiry_cache_flush(struct hci_dev *hdev); /* ----- HCI Connections ----- */ enum { HCI_CONN_AUTH_PEND, HCI_CONN_ENCRYPT_PEND, HCI_CONN_RSWITCH_PEND, HCI_CONN_MODE_CHANGE_PEND, HCI_CONN_SCO_SETUP_PEND, HCI_CONN_MGMT_CONNECTED, HCI_CONN_SSP_ENABLED, HCI_CONN_SC_ENABLED, HCI_CONN_AES_CCM, HCI_CONN_POWER_SAVE, HCI_CONN_FLUSH_KEY, HCI_CONN_ENCRYPT, HCI_CONN_AUTH, HCI_CONN_SECURE, HCI_CONN_FIPS, HCI_CONN_STK_ENCRYPT, HCI_CONN_AUTH_INITIATOR, HCI_CONN_DROP, HCI_CONN_CANCEL, HCI_CONN_PARAM_REMOVAL_PEND, HCI_CONN_NEW_LINK_KEY, HCI_CONN_SCANNING, HCI_CONN_AUTH_FAILURE, HCI_CONN_PER_ADV, HCI_CONN_BIG_CREATED, HCI_CONN_CREATE_CIS, HCI_CONN_CREATE_BIG_SYNC, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, HCI_CONN_CREATE_PA_SYNC, HCI_CONN_PA_SYNC, HCI_CONN_PA_SYNC_FAILED, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; return hci_dev_test_flag(hdev, HCI_SSP_ENABLED) && test_bit(HCI_CONN_SSP_ENABLED, &conn->flags); } static inline bool hci_conn_sc_enabled(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; return hci_dev_test_flag(hdev, HCI_SC_ENABLED) && test_bit(HCI_CONN_SC_ENABLED, &conn->flags); } static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) { struct hci_conn_hash *h = &hdev->conn_hash; list_add_tail_rcu(&c->list, &h->list); switch (c->type) { case ACL_LINK: h->acl_num++; break; case LE_LINK: h->le_num++; if (c->role == HCI_ROLE_SLAVE) h->le_num_peripheral++; break; case SCO_LINK: case ESCO_LINK: h->sco_num++; break; case ISO_LINK: h->iso_num++; break; } } static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) { struct hci_conn_hash *h = &hdev->conn_hash; list_del_rcu(&c->list); synchronize_rcu(); switch (c->type) { case ACL_LINK: h->acl_num--; break; case LE_LINK: h->le_num--; if (c->role == HCI_ROLE_SLAVE) h->le_num_peripheral--; break; case SCO_LINK: case ESCO_LINK: h->sco_num--; break; case ISO_LINK: h->iso_num--; break; } } static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; switch (type) { case ACL_LINK: return h->acl_num; case LE_LINK: return h->le_num; case SCO_LINK: case ESCO_LINK: return h->sco_num; case ISO_LINK: return h->iso_num; default: return 0; } } static inline unsigned int hci_conn_count(struct hci_dev *hdev) { struct hci_conn_hash *c = &hdev->conn_hash; return c->acl_num + c->sco_num + c->le_num + c->iso_num; } static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c == conn) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; __u8 type = INVALID_LINK; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->handle == handle) { type = c->type; break; } } rcu_read_unlock(); return type; } static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, bdaddr_t *ba, __u8 bis) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (bacmp(&c->dst, ba) || c->type != ISO_LINK) continue; if (c->iso_qos.bcast.bis == bis) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev, __u8 sid, bdaddr_t *dst, __u8 dst_type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != ISO_LINK || bacmp(&c->dst, dst) || c->dst_type != dst_type || c->sid != sid) continue; rcu_read_unlock(); return c; } rcu_read_unlock(); return NULL; } static inline struct hci_conn * hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, bdaddr_t *ba, __u8 big, __u8 bis) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (bacmp(&c->dst, ba) || c->type != ISO_LINK || !test_bit(HCI_CONN_PER_ADV, &c->flags)) continue; if (c->iso_qos.bcast.big == big && c->iso_qos.bcast.bis == bis) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn *hci_conn_hash_lookup_handle(struct hci_dev *hdev, __u16 handle) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->handle == handle) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev, __u8 type, bdaddr_t *ba) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type == type && !bacmp(&c->dst, ba)) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != LE_LINK) continue; if (ba_type == c->dst_type && !bacmp(&c->dst, ba)) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type, __u8 cig, __u8 id) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) continue; /* Match CIG ID if set */ if (cig != c->iso_qos.ucast.cig) continue; /* Match CIS ID if set */ if (id != c->iso_qos.ucast.cis) continue; /* Match destination address if set */ if (!ba || (ba_type == c->dst_type && !bacmp(&c->dst, ba))) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, __u8 handle) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) continue; if (handle == c->iso_qos.ucast.cig) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, __u8 handle) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != ISO_LINK) continue; /* An ISO_LINK hcon with BDADDR_ANY as destination * address is a Broadcast connection. A Broadcast * slave connection is associated with a PA train, * so the sync_handle can be used to differentiate * from unicast. */ if (bacmp(&c->dst, BDADDR_ANY) && c->sync_handle == HCI_SYNC_HANDLE_INVALID) continue; if (handle == c->iso_qos.bcast.big) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn * hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, __u8 handle, __u8 num_bis) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != ISO_LINK) continue; if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn * hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK || c->state != state) continue; if (handle == c->iso_qos.bcast.big) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn * hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != ISO_LINK || !test_bit(HCI_CONN_PA_SYNC, &c->flags)) continue; if (c->iso_qos.bcast.big == big) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn * hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != ISO_LINK) continue; /* Ignore the listen hcon, we are looking * for the child hcon that was created as * a result of the PA sync established event. */ if (c->state == BT_LISTEN) continue; if (c->sync_handle == sync_handle) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, __u8 type, __u16 state) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type == type && c->state == state) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data); static inline void hci_conn_hash_list_state(struct hci_dev *hdev, hci_conn_func_t func, __u8 type, __u16 state, void *data) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; if (!func) return; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type == type && c->state == state) func(c, data); } rcu_read_unlock(); } static inline void hci_conn_hash_list_flag(struct hci_dev *hdev, hci_conn_func_t func, __u8 type, __u8 flag, void *data) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; if (!func) return; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type == type && test_bit(flag, &c->flags)) func(c, data); } rcu_read_unlock(); } static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type == LE_LINK && c->state == BT_CONNECT && !test_bit(HCI_CONN_SCANNING, &c->flags)) { rcu_read_unlock(); return c; } } rcu_read_unlock(); return NULL; } /* Returns true if an le connection is in the scanning state */ static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type == LE_LINK && c->state == BT_CONNECT && test_bit(HCI_CONN_SCANNING, &c->flags)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); int hci_pa_create_sync_pending(struct hci_dev *hdev); int hci_le_big_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role, u16 handle); struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role); void hci_conn_del(struct hci_conn *conn); void hci_conn_hash_flush(struct hci_dev *hdev); struct hci_chan *hci_chan_create(struct hci_conn *conn); void hci_chan_del(struct hci_chan *chan); void hci_chan_list_flush(struct hci_conn *conn); struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle); struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, enum conn_reasons conn_reason); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, bool dst_resolved, u8 sec_level, u16 conn_timeout, u8 role, u8 phy, u8 sec_phy); void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status); struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, enum conn_reasons conn_reason, u16 timeout); struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec, u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, struct bt_iso_qos *qos, __u8 base_len, __u8 *base); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos, __u8 data_len, __u8 *data); struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, bool initiator); int hci_conn_switch_role(struct hci_conn *conn, __u8 role); void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); void hci_conn_failed(struct hci_conn *conn, u8 status); u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle); void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb); void hci_conn_tx_dequeue(struct hci_conn *conn); void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset, const struct sockcm_cookie *sockc); static inline void hci_sockcm_init(struct sockcm_cookie *sockc, struct sock *sk) { *sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags), }; } /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an * "hci_conn" object. They do not guarantee that the hci_conn object is running, * working or anything else. They just guarantee that the object is available * and can be dereferenced. So you can use its locks, local variables and any * other constant data. * Before accessing runtime data, you _must_ lock the object and then check that * it is still running. As soon as you release the locks, the connection might * get dropped, though. * * On the other hand, hci_conn_hold() and hci_conn_drop() are used to control * how long the underlying connection is held. So every channel that runs on the * hci_conn object calls this to prevent the connection from disappearing. As * long as you hold a device, you must also guarantee that you have a valid * reference to the device via hci_conn_get() (or the initial reference from * hci_conn_add()). * The hold()/drop() ref-count is known to drop below 0 sometimes, which doesn't * break because nobody cares for that. But this means, we cannot use * _get()/_drop() in it, but require the caller to have a valid ref (FIXME). */ static inline struct hci_conn *hci_conn_get(struct hci_conn *conn) { get_device(&conn->dev); return conn; } static inline void hci_conn_put(struct hci_conn *conn) { put_device(&conn->dev); } static inline struct hci_conn *hci_conn_hold(struct hci_conn *conn) { BT_DBG("hcon %p orig refcnt %d", conn, atomic_read(&conn->refcnt)); atomic_inc(&conn->refcnt); cancel_delayed_work(&conn->disc_work); return conn; } static inline void hci_conn_drop(struct hci_conn *conn) { BT_DBG("hcon %p orig refcnt %d", conn, atomic_read(&conn->refcnt)); if (atomic_dec_and_test(&conn->refcnt)) { unsigned long timeo; switch (conn->type) { case ACL_LINK: case LE_LINK: cancel_delayed_work(&conn->idle_work); if (conn->state == BT_CONNECTED) { timeo = conn->disc_timeout; if (!conn->out) timeo *= 2; } else { timeo = 0; } break; default: timeo = 0; break; } cancel_delayed_work(&conn->disc_work); queue_delayed_work(conn->hdev->workqueue, &conn->disc_work, timeo); } } /* ----- HCI Devices ----- */ static inline void hci_dev_put(struct hci_dev *d) { BT_DBG("%s orig refcnt %d", d->name, kref_read(&d->dev.kobj.kref)); put_device(&d->dev); } static inline struct hci_dev *hci_dev_hold(struct hci_dev *d) { BT_DBG("%s orig refcnt %d", d->name, kref_read(&d->dev.kobj.kref)); get_device(&d->dev); return d; } #define hci_dev_lock(d) mutex_lock(&d->lock) #define hci_dev_unlock(d) mutex_unlock(&d->lock) #define to_hci_dev(d) container_of(d, struct hci_dev, dev) #define to_hci_conn(c) container_of(c, struct hci_conn, dev) static inline void *hci_get_drvdata(struct hci_dev *hdev) { return dev_get_drvdata(&hdev->dev); } static inline void hci_set_drvdata(struct hci_dev *hdev, void *data) { dev_set_drvdata(&hdev->dev, data); } static inline void *hci_get_priv(struct hci_dev *hdev) { return (char *)hdev + sizeof(*hdev); } struct hci_dev *hci_dev_get(int index); struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, u8 src_type); struct hci_dev *hci_alloc_dev_priv(int sizeof_priv); static inline struct hci_dev *hci_alloc_dev(void) { return hci_alloc_dev_priv(0); } void hci_free_dev(struct hci_dev *hdev); int hci_register_dev(struct hci_dev *hdev); void hci_unregister_dev(struct hci_dev *hdev); void hci_release_dev(struct hci_dev *hdev); int hci_register_suspend_notifier(struct hci_dev *hdev); int hci_unregister_suspend_notifier(struct hci_dev *hdev); int hci_suspend_dev(struct hci_dev *hdev); int hci_resume_dev(struct hci_dev *hdev); int hci_reset_dev(struct hci_dev *hdev); int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); __printf(2, 3) void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); __printf(2, 3) void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); static inline void hci_set_msft_opcode(struct hci_dev *hdev, __u16 opcode) { #if IS_ENABLED(CONFIG_BT_MSFTEXT) hdev->msft_opcode = opcode; #endif } static inline void hci_set_aosp_capable(struct hci_dev *hdev) { #if IS_ENABLED(CONFIG_BT_AOSPEXT) hdev->aosp_capable = true; #endif } static inline void hci_devcd_setup(struct hci_dev *hdev) { #ifdef CONFIG_DEV_COREDUMP INIT_WORK(&hdev->dump.dump_rx, hci_devcd_rx); INIT_DELAYED_WORK(&hdev->dump.dump_timeout, hci_devcd_timeout); skb_queue_head_init(&hdev->dump.dump_q); #endif } int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); int hci_dev_reset(__u16 dev); int hci_dev_reset_stat(__u16 dev); int hci_dev_cmd(unsigned int cmd, void __user *arg); int hci_get_dev_list(void __user *arg); int hci_get_dev_info(void __user *arg); int hci_get_conn_list(void __user *arg); int hci_get_conn_info(struct hci_dev *hdev, void __user *arg); int hci_get_auth_info(struct hci_dev *hdev, void __user *arg); int hci_inquiry(void __user *arg); struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *list, bdaddr_t *bdaddr, u8 type); struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( struct list_head *list, bdaddr_t *bdaddr, u8 type); struct bdaddr_list_with_flags * hci_bdaddr_list_lookup_with_flags(struct list_head *list, bdaddr_t *bdaddr, u8 type); int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type); int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type, u8 *peer_irk, u8 *local_irk); int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, u8 type, u32 flags); int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type); int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type); void hci_bdaddr_list_clear(struct list_head *list); struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_clear_disabled(struct hci_dev *hdev); void hci_conn_params_free(struct hci_conn_params *param); void hci_pend_le_list_del_init(struct hci_conn_params *param); void hci_pend_le_list_add(struct hci_conn_params *param, struct list_head *list); struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type); void hci_uuids_clear(struct hci_dev *hdev); void hci_link_keys_clear(struct hci_dev *hdev); struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr); struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len, bool *persistent); struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 type, u8 authenticated, u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand); struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 role); int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type); void hci_smp_ltks_clear(struct hci_dev *hdev); int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr); struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa); struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type); struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 val[16], bdaddr_t *rpa); void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type); bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]); void hci_blocked_keys_clear(struct hci_dev *hdev); void hci_smp_irks_clear(struct hci_dev *hdev); bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type); void hci_remote_oob_data_clear(struct hci_dev *hdev); struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type); int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 *hash192, u8 *rand192, u8 *hash256, u8 *rand256); int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type); void hci_adv_instances_clear(struct hci_dev *hdev); struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance); struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance); struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle); struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval); int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data); int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance); void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired); u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance); bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance); void hci_adv_monitors_clear(struct hci_dev *hdev); void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor); int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor); int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle); int hci_remove_all_adv_monitor(struct hci_dev *hdev); bool hci_is_adv_monitoring(struct hci_dev *hdev); int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); void hci_init_sysfs(struct hci_dev *hdev); void hci_conn_init_sysfs(struct hci_conn *conn); void hci_conn_add_sysfs(struct hci_conn *conn); void hci_conn_del_sysfs(struct hci_conn *conn); #define SET_HCIDEV_DEV(hdev, pdev) ((hdev)->dev.parent = (pdev)) #define GET_HCIDEV_DEV(hdev) ((hdev)->dev.parent) /* ----- LMP capabilities ----- */ #define lmp_encrypt_capable(dev) ((dev)->features[0][0] & LMP_ENCRYPT) #define lmp_rswitch_capable(dev) ((dev)->features[0][0] & LMP_RSWITCH) #define lmp_hold_capable(dev) ((dev)->features[0][0] & LMP_HOLD) #define lmp_sniff_capable(dev) ((dev)->features[0][0] & LMP_SNIFF) #define lmp_park_capable(dev) ((dev)->features[0][1] & LMP_PARK) #define lmp_sco_capable(dev) ((dev)->features[0][1] & LMP_SCO) #define lmp_inq_rssi_capable(dev) ((dev)->features[0][3] & LMP_RSSI_INQ) #define lmp_esco_capable(dev) ((dev)->features[0][3] & LMP_ESCO) #define lmp_bredr_capable(dev) (!((dev)->features[0][4] & LMP_NO_BREDR)) #define lmp_le_capable(dev) ((dev)->features[0][4] & LMP_LE) #define lmp_sniffsubr_capable(dev) ((dev)->features[0][5] & LMP_SNIFF_SUBR) #define lmp_pause_enc_capable(dev) ((dev)->features[0][5] & LMP_PAUSE_ENC) #define lmp_esco_2m_capable(dev) ((dev)->features[0][5] & LMP_EDR_ESCO_2M) #define lmp_ext_inq_capable(dev) ((dev)->features[0][6] & LMP_EXT_INQ) #define lmp_le_br_capable(dev) (!!((dev)->features[0][6] & LMP_SIMUL_LE_BR)) #define lmp_ssp_capable(dev) ((dev)->features[0][6] & LMP_SIMPLE_PAIR) #define lmp_no_flush_capable(dev) ((dev)->features[0][6] & LMP_NO_FLUSH) #define lmp_lsto_capable(dev) ((dev)->features[0][7] & LMP_LSTO) #define lmp_inq_tx_pwr_capable(dev) ((dev)->features[0][7] & LMP_INQ_TX_PWR) #define lmp_ext_feat_capable(dev) ((dev)->features[0][7] & LMP_EXTFEATURES) #define lmp_transp_capable(dev) ((dev)->features[0][2] & LMP_TRANSPARENT) #define lmp_edr_2m_capable(dev) ((dev)->features[0][3] & LMP_EDR_2M) #define lmp_edr_3m_capable(dev) ((dev)->features[0][3] & LMP_EDR_3M) #define lmp_edr_3slot_capable(dev) ((dev)->features[0][4] & LMP_EDR_3SLOT) #define lmp_edr_5slot_capable(dev) ((dev)->features[0][5] & LMP_EDR_5SLOT) /* ----- Extended LMP capabilities ----- */ #define lmp_cpb_central_capable(dev) ((dev)->features[2][0] & LMP_CPB_CENTRAL) #define lmp_cpb_peripheral_capable(dev) ((dev)->features[2][0] & LMP_CPB_PERIPHERAL) #define lmp_sync_train_capable(dev) ((dev)->features[2][0] & LMP_SYNC_TRAIN) #define lmp_sync_scan_capable(dev) ((dev)->features[2][0] & LMP_SYNC_SCAN) #define lmp_sc_capable(dev) ((dev)->features[2][1] & LMP_SC) #define lmp_ping_capable(dev) ((dev)->features[2][1] & LMP_PING) /* ----- Host capabilities ----- */ #define lmp_host_ssp_capable(dev) ((dev)->features[1][0] & LMP_HOST_SSP) #define lmp_host_sc_capable(dev) ((dev)->features[1][0] & LMP_HOST_SC) #define lmp_host_le_capable(dev) (!!((dev)->features[1][0] & LMP_HOST_LE)) #define lmp_host_le_br_capable(dev) (!!((dev)->features[1][0] & LMP_HOST_LE_BREDR)) #define hdev_is_powered(dev) (test_bit(HCI_UP, &(dev)->flags) && \ !hci_dev_test_flag(dev, HCI_AUTO_OFF)) #define bredr_sc_enabled(dev) (lmp_sc_capable(dev) && \ hci_dev_test_flag(dev, HCI_SC_ENABLED)) #define rpa_valid(dev) (bacmp(&dev->rpa, BDADDR_ANY) && \ !hci_dev_test_flag(dev, HCI_RPA_EXPIRED)) #define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \ !adv->rpa_expired) #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) #define le_2m_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_2M)) #define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) #define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ &(dev)->quirks)) #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ (hdev->commands[39] & 0x04)) #define read_key_size_capable(dev) \ ((dev)->commands[20] & 0x10 && \ !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks)) #define read_voice_setting_capable(dev) \ ((dev)->commands[9] & 0x04 && \ !test_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &(dev)->quirks)) /* Use enhanced synchronous connection if command is supported and its quirk * has not been set. */ #define enhanced_sync_conn_capable(dev) \ (((dev)->commands[29] & 0x08) && \ !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks)) /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40) && \ !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) /* Use ext create connection if command is supported */ #define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) /* Maximum advertising length */ #define max_adv_len(dev) \ (ext_adv_capable(dev) ? HCI_MAX_EXT_AD_LENGTH : HCI_MAX_AD_LENGTH) /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: * * C24: Mandatory if the LE Controller supports Connection State and either * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported */ #define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ ext_adv_capable(dev)) && \ !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \ &(dev)->quirks)) /* Periodic advertising support */ #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) /* CIS Master/Slave and BIS support */ #define iso_capable(dev) (cis_capable(dev) || bis_capable(dev)) #define cis_capable(dev) \ (cis_central_capable(dev) || cis_peripheral_capable(dev)) #define cis_central_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_CENTRAL) #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) #define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type, __u8 *flags) { switch (type) { case ACL_LINK: return l2cap_connect_ind(hdev, bdaddr); case SCO_LINK: case ESCO_LINK: return sco_connect_ind(hdev, bdaddr, flags); case ISO_LINK: return iso_connect_ind(hdev, bdaddr, flags); default: BT_ERR("unknown link type %d", type); return -EINVAL; } } static inline int hci_proto_disconn_ind(struct hci_conn *conn) { if (conn->type != ACL_LINK && conn->type != LE_LINK) return HCI_ERROR_REMOTE_USER_TERM; return l2cap_disconn_ind(conn); } /* ----- HCI callbacks ----- */ struct hci_cb { struct list_head list; char *name; void (*connect_cfm) (struct hci_conn *conn, __u8 status); void (*disconn_cfm) (struct hci_conn *conn, __u8 status); void (*security_cfm) (struct hci_conn *conn, __u8 status, __u8 encrypt); void (*key_change_cfm) (struct hci_conn *conn, __u8 status); void (*role_switch_cfm) (struct hci_conn *conn, __u8 status, __u8 role); }; static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status) { struct hci_cb *cb; mutex_lock(&hci_cb_list_lock); list_for_each_entry(cb, &hci_cb_list, list) { if (cb->connect_cfm) cb->connect_cfm(conn, status); } mutex_unlock(&hci_cb_list_lock); if (conn->connect_cfm_cb) conn->connect_cfm_cb(conn, status); } static inline void hci_disconn_cfm(struct hci_conn *conn, __u8 reason) { struct hci_cb *cb; mutex_lock(&hci_cb_list_lock); list_for_each_entry(cb, &hci_cb_list, list) { if (cb->disconn_cfm) cb->disconn_cfm(conn, reason); } mutex_unlock(&hci_cb_list_lock); if (conn->disconn_cfm_cb) conn->disconn_cfm_cb(conn, reason); } static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) { struct hci_cb *cb; __u8 encrypt; if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) return; encrypt = test_bit(HCI_CONN_ENCRYPT, &conn->flags) ? 0x01 : 0x00; mutex_lock(&hci_cb_list_lock); list_for_each_entry(cb, &hci_cb_list, list) { if (cb->security_cfm) cb->security_cfm(conn, status, encrypt); } mutex_unlock(&hci_cb_list_lock); if (conn->security_cfm_cb) conn->security_cfm_cb(conn, status); } static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) { struct hci_cb *cb; __u8 encrypt; if (conn->state == BT_CONFIG) { if (!status) conn->state = BT_CONNECTED; hci_connect_cfm(conn, status); hci_conn_drop(conn); return; } if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) encrypt = 0x00; else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) encrypt = 0x02; else encrypt = 0x01; if (!status) { if (conn->sec_level == BT_SECURITY_SDP) conn->sec_level = BT_SECURITY_LOW; if (conn->pending_sec_level > conn->sec_level) conn->sec_level = conn->pending_sec_level; } mutex_lock(&hci_cb_list_lock); list_for_each_entry(cb, &hci_cb_list, list) { if (cb->security_cfm) cb->security_cfm(conn, status, encrypt); } mutex_unlock(&hci_cb_list_lock); if (conn->security_cfm_cb) conn->security_cfm_cb(conn, status); } static inline void hci_key_change_cfm(struct hci_conn *conn, __u8 status) { struct hci_cb *cb; mutex_lock(&hci_cb_list_lock); list_for_each_entry(cb, &hci_cb_list, list) { if (cb->key_change_cfm) cb->key_change_cfm(conn, status); } mutex_unlock(&hci_cb_list_lock); } static inline void hci_role_switch_cfm(struct hci_conn *conn, __u8 status, __u8 role) { struct hci_cb *cb; mutex_lock(&hci_cb_list_lock); list_for_each_entry(cb, &hci_cb_list, list) { if (cb->role_switch_cfm) cb->role_switch_cfm(conn, status, role); } mutex_unlock(&hci_cb_list_lock); } static inline bool hci_bdaddr_is_rpa(bdaddr_t *bdaddr, u8 addr_type) { if (addr_type != ADDR_LE_DEV_RANDOM) return false; if ((bdaddr->b[5] & 0xc0) == 0x40) return true; return false; } static inline bool hci_is_identity_address(bdaddr_t *addr, u8 addr_type) { if (addr_type == ADDR_LE_DEV_PUBLIC) return true; /* Check for Random Static address type */ if ((addr->b[5] & 0xc0) == 0xc0) return true; return false; } static inline struct smp_irk *hci_get_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { if (!hci_bdaddr_is_rpa(bdaddr, addr_type)) return NULL; return hci_find_irk_by_rpa(hdev, bdaddr); } static inline int hci_check_conn_params(u16 min, u16 max, u16 latency, u16 to_multiplier) { u16 max_latency; if (min > max) { BT_WARN("min %d > max %d", min, max); return -EINVAL; } if (min < 6) { BT_WARN("min %d < 6", min); return -EINVAL; } if (max > 3200) { BT_WARN("max %d > 3200", max); return -EINVAL; } if (to_multiplier < 10) { BT_WARN("to_multiplier %d < 10", to_multiplier); return -EINVAL; } if (to_multiplier > 3200) { BT_WARN("to_multiplier %d > 3200", to_multiplier); return -EINVAL; } if (max >= to_multiplier * 8) { BT_WARN("max %d >= to_multiplier %d * 8", max, to_multiplier); return -EINVAL; } max_latency = (to_multiplier * 4 / max) - 1; if (latency > 499) { BT_WARN("latency %d > 499", latency); return -EINVAL; } if (latency > max_latency) { BT_WARN("latency %d > max_latency %d", latency, max_latency); return -EINVAL; } return 0; } int hci_register_cb(struct hci_cb *hcb); int hci_unregister_cb(struct hci_cb *hcb); int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param); int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param); void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags); void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb); void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb); void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode); void *hci_recv_event_data(struct hci_dev *hdev, __u8 event); u32 hci_conn_get_phy(struct hci_conn *conn); /* ----- HCI Sockets ----- */ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb); void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk); void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb); void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, void *data, u16 data_len, ktime_t tstamp, int flag, struct sock *skip_sk); void hci_sock_dev_event(struct hci_dev *hdev, int event); #define HCI_MGMT_VAR_LEN BIT(0) #define HCI_MGMT_NO_HDEV BIT(1) #define HCI_MGMT_UNTRUSTED BIT(2) #define HCI_MGMT_UNCONFIGURED BIT(3) #define HCI_MGMT_HDEV_OPTIONAL BIT(4) struct hci_mgmt_handler { int (*func) (struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len); size_t data_len; unsigned long flags; }; struct hci_mgmt_chan { struct list_head list; unsigned short channel; size_t handler_count; const struct hci_mgmt_handler *handlers; void (*hdev_init) (struct sock *sk, struct hci_dev *hdev); }; int hci_mgmt_chan_register(struct hci_mgmt_chan *c); void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); /* Management interface */ #define DISCOV_TYPE_BREDR (BIT(BDADDR_BREDR)) #define DISCOV_TYPE_LE (BIT(BDADDR_LE_PUBLIC) | \ BIT(BDADDR_LE_RANDOM)) #define DISCOV_TYPE_INTERLEAVED (BIT(BDADDR_BREDR) | \ BIT(BDADDR_LE_PUBLIC) | \ BIT(BDADDR_LE_RANDOM)) /* These LE scan and inquiry parameters were chosen according to LE General * Discovery Procedure specification. */ #define DISCOV_LE_SCAN_WIN 0x0012 /* 11.25 msec */ #define DISCOV_LE_SCAN_INT 0x0012 /* 11.25 msec */ #define DISCOV_LE_SCAN_INT_FAST 0x0060 /* 60 msec */ #define DISCOV_LE_SCAN_WIN_FAST 0x0030 /* 30 msec */ #define DISCOV_LE_SCAN_INT_CONN 0x0060 /* 60 msec */ #define DISCOV_LE_SCAN_WIN_CONN 0x0060 /* 60 msec */ #define DISCOV_LE_SCAN_INT_SLOW1 0x0800 /* 1.28 sec */ #define DISCOV_LE_SCAN_WIN_SLOW1 0x0012 /* 11.25 msec */ #define DISCOV_LE_SCAN_INT_SLOW2 0x1000 /* 2.56 sec */ #define DISCOV_LE_SCAN_WIN_SLOW2 0x0024 /* 22.5 msec */ #define DISCOV_CODED_SCAN_INT_FAST 0x0120 /* 180 msec */ #define DISCOV_CODED_SCAN_WIN_FAST 0x0090 /* 90 msec */ #define DISCOV_CODED_SCAN_INT_SLOW1 0x1800 /* 3.84 sec */ #define DISCOV_CODED_SCAN_WIN_SLOW1 0x0036 /* 33.75 msec */ #define DISCOV_CODED_SCAN_INT_SLOW2 0x3000 /* 7.68 sec */ #define DISCOV_CODED_SCAN_WIN_SLOW2 0x006c /* 67.5 msec */ #define DISCOV_LE_TIMEOUT 10240 /* msec */ #define DISCOV_INTERLEAVED_TIMEOUT 5120 /* msec */ #define DISCOV_INTERLEAVED_INQUIRY_LEN 0x04 #define DISCOV_BREDR_INQUIRY_LEN 0x08 #define DISCOV_LE_RESTART_DELAY msecs_to_jiffies(200) /* msec */ #define DISCOV_LE_FAST_ADV_INT_MIN 0x00A0 /* 100 msec */ #define DISCOV_LE_FAST_ADV_INT_MAX 0x00F0 /* 150 msec */ #define DISCOV_LE_PER_ADV_INT_MIN 0x00A0 /* 200 msec */ #define DISCOV_LE_PER_ADV_INT_MAX 0x00A0 /* 200 msec */ #define DISCOV_LE_ADV_MESH_MIN 0x00A0 /* 100 msec */ #define DISCOV_LE_ADV_MESH_MAX 0x00A0 /* 100 msec */ #define INTERVAL_TO_MS(x) (((x) * 10) / 0x10) #define NAME_RESOLVE_DURATION msecs_to_jiffies(10240) /* 10.24 sec */ void mgmt_fill_version_info(void *ver); int mgmt_new_settings(struct hci_dev *hdev); void mgmt_index_added(struct hci_dev *hdev); void mgmt_index_removed(struct hci_dev *hdev); void mgmt_set_powered_failed(struct hci_dev *hdev, int err); void mgmt_power_on(struct hci_dev *hdev, int err); void __mgmt_power_off(struct hci_dev *hdev); void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent); void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, u8 *name, u8 name_len); void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 reason, bool mgmt_connected); void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status); void mgmt_connect_failed(struct hci_dev *hdev, struct hci_conn *conn, u8 status); void mgmt_pin_code_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 secure); void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 status); void mgmt_pin_code_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 status); int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u32 value, u8 confirm_hint); int mgmt_user_confirm_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status); int mgmt_user_confirm_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status); int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type); int mgmt_user_passkey_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status); int mgmt_user_passkey_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status); int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u32 passkey, u8 entered); void mgmt_auth_failed(struct hci_conn *conn, u8 status); void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status); void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, u64 instant); void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, s8 rssi, u8 *name, u8 name_len); void mgmt_discovering(struct hci_dev *hdev, u8 discovering); void mgmt_suspending(struct hci_dev *hdev, u8 state); void mgmt_resuming(struct hci_dev *hdev, u8 reason, bdaddr_t *bdaddr, u8 addr_type); bool mgmt_powering_down(struct hci_dev *hdev); void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent); void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent); void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, bool persistent); void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 store_hint, u16 min_interval, u16 max_interval, u16 latency, u16 timeout); void mgmt_smp_complete(struct hci_conn *conn, bool complete); bool mgmt_get_connectable(struct hci_dev *hdev); u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev); void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type); int hci_abort_conn(struct hci_conn *conn, u8 reason); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, __u8 ltk[16], __u8 key_size); void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *bdaddr_type); #define SCO_AIRMODE_MASK 0x0003 #define SCO_AIRMODE_CVSD 0x0000 #define SCO_AIRMODE_TRANSP 0x0003 #define LOCAL_CODEC_ACL_MASK BIT(0) #define LOCAL_CODEC_SCO_MASK BIT(1) #define TRANSPORT_TYPE_MAX 0x04 #endif /* __HCI_CORE_H */
9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 // SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2012 Taobao. * Written by Tao Ma <boyu.mt@taobao.com> */ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/namei.h> #include <linux/iversion.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, void **fsdata); static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) return EXT4_I(inode)->i_inline_size; return 0; } static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; void *end; int free, min_offs; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; min_offs = EXT4_SB(inode->i_sb)->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - EXT4_I(inode)->i_extra_isize - sizeof(struct ext4_xattr_ibody_header); /* * We need to subtract another sizeof(__u32) since an in-inode xattr * needs an empty 4 bytes to indicate the gap between the xattr entry * and the name/value pair. */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return EXT4_XATTR_SIZE(min_offs - EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - EXT4_XATTR_ROUND - sizeof(__u32)); raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; /* Compute min_offs. */ while (!IS_LAST_ENTRY(entry)) { void *next = EXT4_XATTR_NEXT(entry); if (next >= end) { EXT4_ERROR_INODE(inode, "corrupt xattr in inline inode"); return 0; } if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } entry = next; } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); if (EXT4_I(inode)->i_inline_off) { entry = (struct ext4_xattr_entry *) ((void *)raw_inode + EXT4_I(inode)->i_inline_off); free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); goto out; } free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); if (free > EXT4_XATTR_ROUND) free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); else free = 0; out: return free; } /* * Get the maximum size we now can store in an inode. * If we can't find the space for a xattr entry, don't use the space * of the extents since we have no space to indicate the inline data. */ int ext4_get_max_inline_size(struct inode *inode) { int error, max_inline_size; struct ext4_iloc iloc; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) { ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, "can't get inode location %lu", inode->i_ino); return 0; } down_read(&EXT4_I(inode)->xattr_sem); max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); up_read(&EXT4_I(inode)->xattr_sem); brelse(iloc.bh); if (!max_inline_size) return 0; return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before * the new inode has been unlocked */ int ext4_find_inline_data_nolock(struct inode *inode) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int error; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; if (!is.s.not_found) { if (is.s.here->e_value_inum) { EXT4_ERROR_INODE(inode, "inline data xattr refers " "to an external xattr inode"); error = -EFSCORRUPTED; goto out; } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); } out: brelse(is.iloc.bh); return error; } static int ext4_read_inline_data(struct inode *inode, void *buffer, unsigned int len, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; int cp_len = 0; struct ext4_inode *raw_inode; if (!len) return 0; BUG_ON(len > EXT4_I(inode)->i_inline_size); cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE); raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); len -= cp_len; buffer += cp_len; if (!len) goto out; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); len = min_t(unsigned int, len, (unsigned int)le32_to_cpu(entry->e_value_size)); memcpy(buffer, (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); cp_len += len; out: return cp_len; } /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr * value since it is already handled by ext4_xattr_ibody_set. * That saves us one memcpy. */ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, void *buffer, loff_t pos, unsigned int len) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int cp_len = 0; if (unlikely(ext4_emergency_state(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); raw_inode = ext4_raw_inode(iloc); buffer += pos; if (pos < EXT4_MIN_INLINE_DATA_SIZE) { cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE - pos : len; memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); len -= cp_len; buffer += cp_len; pos += cp_len; } if (!len) return; pos -= EXT4_MIN_INLINE_DATA_SIZE; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, buffer, len); } static int ext4_create_inline_data(handle_t *handle, struct inode *inode, unsigned len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; if (len > EXT4_MIN_INLINE_DATA_SIZE) { value = EXT4_ZERO_XATTR_VALUE; len -= EXT4_MIN_INLINE_DATA_SIZE; } else { value = ""; len = 0; } /* Insert the xttr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(!is.s.not_found); error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: brelse(is.iloc.bh); return error; } static int ext4_update_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; /* If the old space is ok, write the data directly. */ if (len <= EXT4_I(inode)->i_inline_size) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(is.s.not_found); len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); if (!value) { error = -ENOMEM; goto out; } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); if (error < 0) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; /* Update the xattr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: kfree(value); brelse(is.iloc.bh); return error; } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return -ENOSPC; size = ext4_get_max_inline_size(inode); if (size < len) return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); ext4_write_unlock_xattr(inode, &no_expand); return ret; } static int ext4_destroy_inline_data_nolock(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_xattr_ibody_find is = { .s = { .not_found = 0, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, .value = NULL, .value_len = 0, }; int error; if (!ei->i_inline_off) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); EXT4_I(inode)->i_inline_off = 0; EXT4_I(inode)->i_inline_size = 0; ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; return error; } static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; BUG_ON(!folio_test_locked(folio)); BUG_ON(!ext4_has_inline_data(inode)); BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", inode->i_ino); goto out; } ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); BUG_ON(len > PAGE_SIZE); kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); kaddr = folio_zero_tail(folio, len, kaddr + len); kunmap_local(kaddr); folio_mark_uptodate(folio); brelse(iloc.bh); out: return ret; } int ext4_readpage_inline(struct inode *inode, struct folio *folio) { int ret = 0; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); return -EAGAIN; } /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ if (!folio->index) ret = ext4_read_inline_folio(inode, folio); else if (!folio_test_uptodate(folio)) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); } up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); return ret >= 0 ? 0 : ret; } static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode) { int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct folio *folio = NULL; unsigned from, to; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { /* * clear the flag so that no new write * will trap here again. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } /* We cannot recurse into the filesystem as the transaction is already * started */ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_nofolio; } ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { ret = 0; goto out; } from = 0; to = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; if (ext4_should_dioread_nolock(inode)) { ret = ext4_block_write_begin(handle, folio, from, to, ext4_get_block_unwritten); } else ret = ext4_block_write_begin(handle, folio, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { folio_unlock(folio); folio_put(folio); folio = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (folio) block_commit_write(folio, from, to); out: if (folio) { folio_unlock(folio); folio_put(folio); } out_nofolio: if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); return ret; } /* * Prepare the write for the inline data. * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the * normal ext4_da_write_end). */ int ext4_generic_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop, void **fsdata, bool da) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_release_bh; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out_stop_journal; if (ret == -ENOSPC) { ext4_journal_stop(handle); if (!da) { brelse(iloc.bh); /* Retry inside */ return ext4_convert_inline_data_to_extent(mapping, inode); } ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; goto out_release_bh; } folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto out_stop_journal; } down_read(&EXT4_I(inode)->xattr_sem); /* Someone else had converted it to extent */ if (!ext4_has_inline_data(inode)) { ret = 0; goto out_release_folio; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out_release_folio; } ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (ret) goto out_release_folio; *foliop = folio; up_read(&EXT4_I(inode)->xattr_sem); brelse(iloc.bh); return 1; out_release_folio: up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); folio_put(folio); out_stop_journal: ext4_journal_stop(handle); out_release_bh: brelse(iloc.bh); return ret; } /* * Try to write data in the inode. * If the inode has inline data, check whether the new write can be * in the inode also. If not, create the page the handle, move the data * to the page make it update and let the later codes create extent for it. */ int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop) { if (pos + len > ext4_get_max_inline_size(inode)) return ext4_convert_inline_data_to_extent(mapping, inode); return ext4_generic_write_inline_data(mapping, inode, pos, len, foliop, NULL, false); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; struct ext4_iloc iloc; int ret = 0, ret2; if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; if (likely(copied)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) { folio_unlock(folio); folio_put(folio); ext4_std_error(inode->i_sb, ret); goto out; } ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); /* * ei->i_inline_off may have changed since * ext4_write_begin() called * ext4_try_to_write_inline_data() */ (void) ext4_find_inline_data_nolock(inode); kaddr = kmap_local_folio(folio, 0); ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); kunmap_local(kaddr); folio_mark_uptodate(folio); /* clear dirty flag so that writepages wouldn't work for us. */ folio_clear_dirty(folio); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); /* * It's important to update i_size while still holding folio * lock: page writeout could otherwise come in and zero * beyond i_size. */ ext4_update_inode_size(inode, pos + copied); } folio_unlock(folio); folio_put(folio); /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (likely(copied)) mark_inode_dirty(inode); out: /* * If we didn't copy as much data as expected, we need to trim back * size of xattr containing inline data. */ if (pos + len > inode->i_size && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases: * 1. The inode is created and the first write exceeds inline size. We can * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't * need to start the journal since the file's metadata isn't changed now. */ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, void **fsdata) { int ret = 0, inline_size; struct folio *folio; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } inline_size = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } ret = ext4_block_write_begin(NULL, folio, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); folio_put(folio); ext4_truncate_failed_write(inode); return ret; } folio_mark_dirty(folio); folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); if (folio) { folio_unlock(folio); folio_put(folio); } return ret; } #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) { int offset; unsigned short de_len; struct ext4_dir_entry_2 *de = inline_start; void *dlimit = inline_start + inline_size; trace_printk("inode %lu\n", dir->i_ino); offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", offset, de_len, de->name_len, de->name, de->name_len, le32_to_cpu(de->inode)); if (ext4_check_dir_entry(dir, NULL, de, bh, inline_start, inline_size, offset)) BUG(); offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } } #else #define ext4_show_inline_dir(dir, bh, inline_start, inline_size) #endif /* * Add a new entry into a inline dir. * It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { int err; struct ext4_dir_entry_2 *de; err = ext4_find_dest_de(dir, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) return err; ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; } static void *ext4_get_inline_xattr_pos(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; BUG_ON(!EXT4_I(inode)->i_inline_off); header = IHDR(inode, ext4_raw_inode(iloc)); entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + EXT4_I(inode)->i_inline_off); return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); } /* Set the final de to cover the whole block. */ static void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; int de_len; de = de_buf; if (old_size) { limit = de_buf + old_size; do { prev_de = de; de_len = ext4_rec_len_from_disk(de->rec_len, old_size); de_buf += de_len; de = de_buf; } while (de_buf < limit); prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - old_size, new_size); } else { /* this is just created, so create an empty entry. */ de->inode = 0; de->rec_len = ext4_rec_len_to_disk(new_size, new_size); } } static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, struct ext4_iloc *iloc) { int ret; int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, new_size + EXT4_MIN_INLINE_DATA_SIZE); if (ret) return ret; ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE); dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; return 0; } static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { int ret; ret = ext4_create_inline_data(handle, inode, inline_size); if (ret) { ext4_msg(inode->i_sb, KERN_EMERG, "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", inode->i_ino, ret); return; } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } static int ext4_finish_convert_inline_dir(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, void *buf, int inline_size) { int err, csum_size = 0, header_size = 0; struct ext4_dir_entry_2 *de; void *target = dir_block->b_data; /* * First create "." and ".." and then copy the dir information * back to the block. */ de = target; de = ext4_init_dot_dotdot(inode, de, inode->i_sb->s_blocksize, csum_size, le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); header_size = (void *)de - target; memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; i_size_write(inode, inode->i_sb->s_blocksize); EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; ext4_update_final_de(dir_block->b_data, inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, inode->i_sb->s_blocksize - csum_size); if (csum_size) ext4_initialize_dirent_tail(dir_block, inode->i_sb->s_blocksize); set_buffer_uptodate(dir_block); unlock_buffer(dir_block); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) return err; set_buffer_verified(dir_block); return ext4_mark_inode_dirty(handle, inode); } static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int error; void *buf = NULL; struct buffer_head *data_bh = NULL; struct ext4_map_blocks map; int inline_size; inline_size = ext4_get_inline_size(inode); buf = kmalloc(inline_size, GFP_NOFS); if (!buf) { error = -ENOMEM; goto out; } error = ext4_read_inline_data(inode, buf, inline_size, iloc); if (error < 0) goto out; /* * Make sure the inline directory entries pass checks before we try to * convert them, so that we avoid touching stuff that needs fsck. */ if (S_ISDIR(inode->i_mode)) { error = ext4_check_all_de(inode, iloc->bh, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (error) goto out; } error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; map.m_lblk = 0; map.m_len = 1; map.m_flags = 0; error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); if (error < 0) goto out_restore; if (!(map.m_flags & EXT4_MAP_MAPPED)) { error = -EIO; goto out_restore; } data_bh = sb_getblk(inode->i_sb, map.m_pblk); if (!data_bh) { error = -ENOMEM; goto out_restore; } lock_buffer(data_bh); error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(data_bh); error = -EIO; goto out_restore; } memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); unlock_buffer(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { error = ext4_finish_convert_inline_dir(handle, inode, data_bh, buf, inline_size); } out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); out: brelse(data_bh); kfree(buf); return error; } /* * Try to add the new entry to the inline data. * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { int ret, ret2, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; ret = ext4_get_inode_loc(dir, &iloc); if (ret) return ret; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; /* check whether it can be inserted to inline xattr space. */ inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; if (!inline_size) { /* Try to use the xattr space.*/ ret = ext4_update_inline_dir(handle, dir, &iloc); if (ret && ret != -ENOSPC) goto out; inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; } /* * The inline space is filled up, so create a new block for it. * As the extent tree will be created, we have to save the inline * dir first. */ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); out: ext4_write_unlock_xattr(dir, &no_expand); ret2 = ext4_mark_inode_dirty(handle, dir); if (unlikely(ret2 && !ret)) ret = ret2; brelse(iloc.bh); return ret; } /* * This function fills a red-black tree with information from an * inlined dir. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data) { int err = 0, count = 0; unsigned int parent_ino; int pos; struct ext4_dir_entry_2 *de; struct inode *inode = file_inode(dir_file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; struct fscrypt_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; pos = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); while (pos < inline_size) { /* * As inlined dir doesn't store any information about '.' and * only the inode number of '..' is stored, we have to handle * them differently. */ if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; } else { de = (struct ext4_dir_entry_2 *)(dir_buf + pos); pos += ext4_rec_len_from_disk(de->rec_len, inline_size); if (ext4_check_dir_entry(inode, dir_file, de, iloc.bh, dir_buf, inline_size, pos)) { ret = count; goto out; } } if (ext4_hash_in_dirent(dir)) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if (err) { ret = err; goto out; } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { ret = err; goto out; } count++; } ret = count; out: kfree(dir_buf); brelse(iloc.bh); return ret; } /* * So this function is called when the volume is mkfsed with * dir_index disabled. In order to keep f_pos persistent * after we convert from an inlined dir to a blocked based, * we just pretend that we are a normal dir and return the * offset as if '.' and '..' really take place. * */ int ext4_read_inline_dir(struct file *file, struct dir_context *ctx, int *has_inline_data) { unsigned int offset, parent_ino; int i; struct ext4_dir_entry_2 *de; struct super_block *sb; struct inode *inode = file_inode(file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; struct dir_private_info *info = file->private_data; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; ret = 0; sb = inode->i_sb; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); offset = ctx->pos; /* * dotdot_offset and dotdot_size is the real offset and * size for ".." and "." if the dir is block based while * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ dotdot_offset = ext4_dir_rec_len(1, NULL); dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; /* * If the cookie has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and * ".." is dotdot_offset. */ if (!i) { i = dotdot_offset; continue; } else if (i == dotdot_offset) { i = dotdot_size; continue; } /* for other entry, the real offset in * the buf has to be tuned accordingly. */ de = (struct ext4_dir_entry_2 *) (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) < ext4_dir_rec_len(1, NULL)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); } offset = i; ctx->pos = offset; info->cookie = inode_query_iversion(inode); } while (ctx->pos < extra_size) { if (ctx->pos == 0) { if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) goto out; ctx->pos = dotdot_offset; continue; } if (ctx->pos == dotdot_offset) { if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) goto out; ctx->pos = dotdot_size; continue; } de = (struct ext4_dir_entry_2 *) (dir_buf + ctx->pos - extra_offset); if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, extra_size, ctx->pos)) goto out; if (le32_to_cpu(de->inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto out; } ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } out: kfree(dir_buf); brelse(iloc.bh); return ret; } void *ext4_read_inline_link(struct inode *inode) { struct ext4_iloc iloc; int ret, inline_size; void *link; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ERR_PTR(ret); ret = -ENOMEM; inline_size = ext4_get_inline_size(inode); link = kmalloc(inline_size + 1, GFP_NOFS); if (!link) goto out; ret = ext4_read_inline_data(inode, link, inline_size, &iloc); if (ret < 0) { kfree(link); goto out; } nd_terminate_link(link, inode->i_size, ret); out: if (ret < 0) link = ERR_PTR(ret); brelse(iloc.bh); return link; } struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval) { struct ext4_iloc iloc; *retval = ext4_get_inode_loc(inode, &iloc); if (*retval) return NULL; *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; return iloc.bh; } /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. * In case of ENOSPC, the caller should create the normal disk layout dir. */ int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode) { int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; struct ext4_iloc iloc; struct ext4_dir_entry_2 *de; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; ret = ext4_prepare_inline_data(handle, inode, inline_size); if (ret) goto out; /* * For inline dir, we only save the inode information for the ".." * and create a fake dentry to cover the left space. */ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; de->inode = cpu_to_le32(parent->i_ino); de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); de->inode = 0; de->rec_len = ext4_rec_len_to_disk( inline_size - EXT4_INLINE_DOTDOT_SIZE, inline_size); set_nlink(inode, 2); inode->i_size = EXT4_I(inode)->i_disksize = inline_size; out: brelse(iloc.bh); return ret; } struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int ret; void *inline_start; int inline_size; ret = ext4_get_inode_loc(dir, &is.iloc); if (ret) return ERR_PTR(ret); down_read(&EXT4_I(dir)->xattr_sem); ret = ext4_xattr_ibody_find(dir, &i, &is); if (ret) goto out; if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) goto out; if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) goto out; inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; out: brelse(is.iloc.bh); if (ret < 0) is.iloc.bh = ERR_PTR(ret); else is.iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); return is.iloc.bh; } int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data) { int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; err = ext4_get_inode_loc(dir, &iloc); if (err) return err; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < EXT4_MIN_INLINE_DATA_SIZE) { inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; } else { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, inline_start, inline_size, 0); if (err) goto out; ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: ext4_write_unlock_xattr(dir, &no_expand); if (likely(err == 0)) err = ext4_mark_inode_dirty(handle, dir); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Get the inline dentry at offset. */ static inline struct ext4_dir_entry_2 * ext4_get_inline_entry(struct inode *inode, struct ext4_iloc *iloc, unsigned int offset, void **inline_start, int *inline_size) { void *inline_pos; BUG_ON(offset > ext4_get_inline_size(inode)); if (offset < EXT4_MIN_INLINE_DATA_SIZE) { inline_pos = (void *)ext4_raw_inode(iloc)->i_block; *inline_size = EXT4_MIN_INLINE_DATA_SIZE; } else { inline_pos = ext4_get_inline_xattr_pos(inode, iloc); offset -= EXT4_MIN_INLINE_DATA_SIZE; *inline_size = ext4_get_inline_size(inode) - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_start) *inline_start = inline_pos; return (struct ext4_dir_entry_2 *)(inline_pos + offset); } bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; bool ret = false; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, "error %d getting inode %lu block", err, dir->i_ino); return false; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; ret = true; goto out; } de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; if (!le32_to_cpu(de->inode)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); goto out; } inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, iloc.bh, inline_pos, inline_size, offset)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - " "inode %u, rec_len %u, name_len %d" "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); goto out; } if (le32_to_cpu(de->inode)) { goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } ret = true; out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); return ret; } int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret, no_expand; ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); return ret; } int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) { __u64 addr; int error = -EAGAIN; struct ext4_iloc iloc; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) goto out; error = ext4_get_inode_loc(inode, &iloc); if (error) goto out; addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; addr += offsetof(struct ext4_inode, i_block); brelse(iloc.bh); iomap->addr = addr; iomap->offset = 0; iomap->length = min_t(loff_t, ext4_get_inline_size(inode), i_size_read(inode)); iomap->type = IOMAP_INLINE; iomap->flags = 0; out: up_read(&EXT4_I(inode)->xattr_sem); return error; } int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; int inline_size, value_len, needed_blocks, no_expand, err = 0; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; needed_blocks = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { ext4_write_unlock_xattr(inode, &no_expand); *has_inline = 0; ext4_journal_stop(handle); return 0; } if ((err = ext4_orphan_add(handle, inode)) != 0) goto out; if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0) goto out; down_write(&EXT4_I(inode)->i_data_sem); i_size = inode->i_size; inline_size = ext4_get_inline_size(inode); EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { /* * if there's inline data to truncate and this file was * converted to extents after that inline data was written, * the extent status cache must be cleared to avoid leaving * behind stale delayed allocated extent entries */ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; BUG_ON(is.s.not_found); value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); if (!value) { err = -ENOMEM; goto out_error; } err = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, value_len); if (err <= 0) goto out_error; i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; err = ext4_xattr_ibody_set(handle, inode, &i, &is); if (err) goto out_error; } /* Clear the content within i_blocks. */ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; memset(p + i_size, 0, EXT4_MIN_INLINE_DATA_SIZE - i_size); } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE : i_size; } out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); if (err == 0) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_journal_stop(handle); return err; } int ext4_convert_inline_data(struct inode *inode) { int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { /* * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is * cleared. This means we are in the middle of moving of * inline data to delay allocated block. Just force writeout * here to finish conversion. */ error = filemap_flush(inode->i_mapping); if (error) return error; if (!ext4_has_inline_data(inode)) return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_free; } ext4_write_lock_xattr(inode, &no_expand); if (ext4_has_inline_data(inode)) error = ext4_convert_inline_data_nolock(handle, inode, &iloc); ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); return error; }
24 38 164 3 3161 124 12 135 166 171 1618 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 /** * css_get - obtain a reference on the specified css * @css: target css * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } CGROUP_REF_EXPORT(css_get) /** * css_get_many - obtain references on the specified css * @css: target css * @n: number of references to get * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_get_many) /** * css_tryget - try to obtain a reference on the specified css * @css: target css * * Obtain a reference on @css unless it already has reached zero and is * being released. This function doesn't care whether @css is on or * offline. The caller naturally needs to ensure that @css is accessible * but doesn't have to be holding a reference on it - IOW, RCU protected * access is good enough for this function. Returns %true if a reference * count was successfully obtained; %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget) /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * * Obtain a reference on @css if it's online. The caller naturally needs * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget_online(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget_online) /** * css_put - put a css reference * @css: target css * * Put a reference obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } CGROUP_REF_EXPORT(css_put) /** * css_put_many - put css references * @css: target css * @n: number of references to put * * Put references obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_put_many)
13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H #include <asm/barrier.h> #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> /* * Barriers in virtio are tricky. Non-SMP virtio guests can't assume * they're not on an SMP host system, so they need to assume real * barriers. Non-SMP virtio hosts could skip the barriers, but does * anyone care? * * For virtio_pci on SMP, we don't need to order with respect to MMIO * accesses through relaxed memory I/O windows, so virt_mb() et al are * sufficient. * * For using virtio to talk to real devices (eg. other heterogeneous * CPUs) we do need real barriers. In theory, we could be using both * kinds of virtio, so it's a runtime decision, and the branch is * actually quite cheap. */ static inline void virtio_mb(bool weak_barriers) { if (weak_barriers) virt_mb(); else mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) virt_rmb(); else dma_rmb(); } static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) virt_wmb(); else dma_wmb(); } #define virtio_store_mb(weak_barriers, p, v) \ do { \ if (weak_barriers) { \ virt_store_mb(*p, v); \ } else { \ WRITE_ONCE(*p, v); \ mb(); \ } \ } while (0) \ struct virtio_device; struct virtqueue; struct device; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Creates a virtqueue and allocates the descriptor ring with per * virtqueue DMA device. */ struct virtqueue *vring_create_virtqueue_dma(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, struct device *dma_dev); /* * Creates a virtqueue with a standard layout but a caller-allocated * ring. */ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Destroys a virtqueue. If created with vring_create_virtqueue, this * also frees the ring. */ void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */
5 27 25 1 15 25 5 5 1 1 27 27 5 5 1 1 1 1 1 1 1 1 1 6 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/uverbs_ioctl.h> #include "rxe.h" #include "rxe_queue.h" #include "rxe_hw_counters.h" static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr); /* dev */ static int rxe_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (udata->inlen || udata->outlen) { rxe_dbg_dev(rxe, "malformed udata\n"); err = -EINVAL; goto err_out; } memcpy(attr, &rxe->attr, sizeof(*attr)); return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); struct net_device *ndev; int err, ret; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } ndev = rxe_ib_device_get_netdev(ibdev); if (!ndev) { err = -ENODEV; goto err_out; } memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, &attr->active_width); attr->state = ib_get_curr_port_state(ndev); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; else if (dev_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); dev_put(ndev); return ret; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_query_gid(struct ib_device *ibdev, u32 port, int idx, union ib_gid *gid) { struct rxe_dev *rxe = to_rdev(ibdev); /* subnet_prefix == interface_id == 0; */ memset(gid, 0, sizeof(*gid)); memcpy(gid->raw, rxe->raw_gid, ETH_ALEN); return 0; } static int rxe_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index, u16 *pkey) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (index != 0) { err = -EINVAL; rxe_dbg_dev(rxe, "bad pkey index = %d\n", index); goto err_out; } *pkey = IB_DEFAULT_PKEY_FULL; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *attr) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | IB_DEVICE_MODIFY_NODE_DESC)) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported mask = 0x%x\n", mask); goto err_out; } if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); if (mask & IB_DEVICE_MODIFY_NODE_DESC) { memcpy(rxe->ib_dev.node_desc, attr->node_desc, sizeof(rxe->ib_dev.node_desc)); } return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_port(struct ib_device *ibdev, u32 port_num, int mask, struct ib_port_modify *attr) { struct rxe_dev *rxe = to_rdev(ibdev); struct rxe_port *port; int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } //TODO is shutdown useful if (mask & ~(IB_PORT_RESET_QKEY_CNTR)) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported mask = 0x%x\n", mask); goto err_out; } port = &rxe->port; port->attr.port_cap_flags |= attr->set_port_cap_mask; port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; if (mask & IB_PORT_RESET_QKEY_CNTR) port->attr.qkey_viol_cntr = 0; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static enum rdma_link_layer rxe_get_link_layer(struct ib_device *ibdev, u32 port_num) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } return IB_LINK_LAYER_ETHERNET; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { struct rxe_dev *rxe = to_rdev(ibdev); struct ib_port_attr attr = {}; int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } err = ib_query_port(ibdev, port_num, &attr); if (err) goto err_out; immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } /* uc */ static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibuc->device); struct rxe_ucontext *uc = to_ruc(ibuc); int err; err = rxe_add_to_pool(&rxe->uc_pool, uc); if (err) rxe_err_dev(rxe, "unable to create uc\n"); return err; } static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) { struct rxe_ucontext *uc = to_ruc(ibuc); int err; err = rxe_cleanup(uc); if (err) rxe_err_uc(uc, "cleanup failed, err = %d\n", err); } /* pd */ static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); int err; err = rxe_add_to_pool(&rxe->pd_pool, pd); if (err) { rxe_dbg_dev(rxe, "unable to alloc pd\n"); goto err_out; } return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_pd *pd = to_rpd(ibpd); int err; err = rxe_cleanup(pd); if (err) rxe_err_pd(pd, "cleanup failed, err = %d\n", err); return 0; } /* ah */ static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); struct rxe_create_ah_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { /* test if new user provider */ if (udata->outlen >= sizeof(*uresp)) uresp = udata->outbuf; ah->is_user = true; } else { ah->is_user = false; } err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); if (err) { rxe_dbg_dev(rxe, "unable to create ah\n"); goto err_out; } /* create index > 0 */ ah->ah_num = ah->elem.index; err = rxe_ah_chk_attr(ah, init_attr->ah_attr); if (err) { rxe_dbg_ah(ah, "bad attr\n"); goto err_cleanup; } if (uresp) { /* only if new user provider */ err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { err = -EFAULT; rxe_dbg_ah(ah, "unable to copy to user\n"); goto err_cleanup; } } else if (ah->is_user) { /* only if old user provider */ ah->ah_num = 0; } rxe_init_av(init_attr->ah_attr, &ah->av); rxe_finalize(ah); return 0; err_cleanup: cleanup_err = rxe_cleanup(ah); if (cleanup_err) rxe_err_ah(ah, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_ah(ah, "returned err = %d\n", err); return err; } static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { struct rxe_ah *ah = to_rah(ibah); int err; err = rxe_ah_chk_attr(ah, attr); if (err) { rxe_dbg_ah(ah, "bad attr\n"); goto err_out; } rxe_init_av(attr, &ah->av); return 0; err_out: rxe_err_ah(ah, "returned err = %d\n", err); return err; } static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { struct rxe_ah *ah = to_rah(ibah); memset(attr, 0, sizeof(*attr)); attr->type = ibah->type; rxe_av_to_attr(&ah->av, attr); return 0; } static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); int err; err = rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); if (err) rxe_err_ah(ah, "cleanup failed, err = %d\n", err); return 0; } /* srq */ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_pd *pd = to_rpd(ibsrq->pd); struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_create_srq_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_err_dev(rxe, "malformed udata\n"); goto err_out; } uresp = udata->outbuf; } if (init->srq_type != IB_SRQT_BASIC) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "srq type = %d, not supported\n", init->srq_type); goto err_out; } err = rxe_srq_chk_init(rxe, init); if (err) { rxe_dbg_dev(rxe, "invalid init attributes\n"); goto err_out; } err = rxe_add_to_pool(&rxe->srq_pool, srq); if (err) { rxe_dbg_dev(rxe, "unable to create srq, err = %d\n", err); goto err_out; } rxe_get(pd); srq->pd = pd; err = rxe_srq_from_init(rxe, srq, init, udata, uresp); if (err) { rxe_dbg_srq(srq, "create srq failed, err = %d\n", err); goto err_cleanup; } return 0; err_cleanup: cleanup_err = rxe_cleanup(srq); if (cleanup_err) rxe_err_srq(srq, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_modify_srq_cmd cmd = {}; int err; if (udata) { if (udata->inlen < sizeof(cmd)) { err = -EINVAL; rxe_dbg_srq(srq, "malformed udata\n"); goto err_out; } err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); if (err) { err = -EFAULT; rxe_dbg_srq(srq, "unable to read udata\n"); goto err_out; } } err = rxe_srq_chk_attr(rxe, srq, attr, mask); if (err) { rxe_dbg_srq(srq, "bad init attributes\n"); goto err_out; } err = rxe_srq_from_attr(rxe, srq, attr, mask, &cmd, udata); if (err) { rxe_dbg_srq(srq, "bad attr\n"); goto err_out; } return 0; err_out: rxe_err_srq(srq, "returned err = %d\n", err); return err; } static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) { struct rxe_srq *srq = to_rsrq(ibsrq); int err; if (srq->error) { err = -EINVAL; rxe_dbg_srq(srq, "srq in error state\n"); goto err_out; } attr->max_wr = srq->rq.queue->buf->index_mask; attr->max_sge = srq->rq.max_sge; attr->srq_limit = srq->limit; return 0; err_out: rxe_err_srq(srq, "returned err = %d\n", err); return err; } static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_srq *srq = to_rsrq(ibsrq); unsigned long flags; spin_lock_irqsave(&srq->rq.producer_lock, flags); while (wr) { err = post_one_recv(&srq->rq, wr); if (unlikely(err)) break; wr = wr->next; } spin_unlock_irqrestore(&srq->rq.producer_lock, flags); if (err) { *bad_wr = wr; rxe_err_srq(srq, "returned err = %d\n", err); } return err; } static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); int err; err = rxe_cleanup(srq); if (err) rxe_err_srq(srq, "cleanup failed, err = %d\n", err); return 0; } /* qp */ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_pd *pd = to_rpd(ibqp->pd); struct rxe_qp *qp = to_rqp(ibqp); struct rxe_create_qp_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->inlen) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } qp->is_user = true; uresp = udata->outbuf; } else { qp->is_user = false; } if (init->create_flags) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported create_flags, err = %d\n", err); goto err_out; } err = rxe_qp_chk_init(rxe, init); if (err) { rxe_dbg_dev(rxe, "bad init attr, err = %d\n", err); goto err_out; } err = rxe_add_to_pool(&rxe->qp_pool, qp); if (err) { rxe_dbg_dev(rxe, "unable to create qp, err = %d\n", err); goto err_out; } err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata); if (err) { rxe_dbg_qp(qp, "create qp failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(qp); return 0; err_cleanup: cleanup_err = rxe_cleanup(qp); if (cleanup_err) rxe_err_qp(qp, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); int err; if (mask & ~IB_QP_ATTR_STANDARD_BITS) { err = -EOPNOTSUPP; rxe_dbg_qp(qp, "unsupported mask = 0x%x, err = %d\n", mask, err); goto err_out; } err = rxe_qp_chk_attr(rxe, qp, attr, mask); if (err) { rxe_dbg_qp(qp, "bad mask/attr, err = %d\n", err); goto err_out; } err = rxe_qp_from_attr(qp, attr, mask, udata); if (err) { rxe_dbg_qp(qp, "modify qp failed, err = %d\n", err); goto err_out; } if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH)) qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, qp->ibqp.qp_num, qp->attr.dest_qp_num); return 0; err_out: rxe_err_qp(qp, "returned err = %d\n", err); return err; } static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_qp_init_attr *init) { struct rxe_qp *qp = to_rqp(ibqp); rxe_qp_to_init(qp, init); rxe_qp_to_attr(qp, attr, mask); return 0; } static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct rxe_qp *qp = to_rqp(ibqp); int err; err = rxe_qp_chk_destroy(qp); if (err) { rxe_dbg_qp(qp, "unable to destroy qp, err = %d\n", err); goto err_out; } err = rxe_cleanup(qp); if (err) rxe_err_qp(qp, "cleanup failed, err = %d\n", err); return 0; err_out: rxe_err_qp(qp, "returned err = %d\n", err); return err; } /* send wr */ /* sanity check incoming send work request */ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int *maskp, unsigned int *lengthp) { int num_sge = ibwr->num_sge; struct rxe_sq *sq = &qp->sq; unsigned int mask = 0; unsigned long length = 0; int err = -EINVAL; int i; do { mask = wr_opcode_mask(ibwr->opcode, qp); if (!mask) { rxe_err_qp(qp, "bad wr opcode for qp type\n"); break; } if (num_sge > sq->max_sge) { rxe_err_qp(qp, "num_sge > max_sge\n"); break; } length = 0; for (i = 0; i < ibwr->num_sge; i++) length += ibwr->sg_list[i].length; if (length > RXE_PORT_MAX_MSG_SZ) { rxe_err_qp(qp, "message length too long\n"); break; } if (mask & WR_ATOMIC_MASK) { if (length != 8) { rxe_err_qp(qp, "atomic length != 8\n"); break; } if (atomic_wr(ibwr)->remote_addr & 0x7) { rxe_err_qp(qp, "misaligned atomic address\n"); break; } } if (ibwr->send_flags & IB_SEND_INLINE) { if (!(mask & WR_INLINE_MASK)) { rxe_err_qp(qp, "opcode doesn't support inline data\n"); break; } if (length > sq->max_inline) { rxe_err_qp(qp, "inline length too big\n"); break; } } err = 0; } while (0); *maskp = mask; *lengthp = (int)length; return err; } static int init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, const struct ib_send_wr *ibwr) { wr->wr_id = ibwr->wr_id; wr->opcode = ibwr->opcode; wr->send_flags = ibwr->send_flags; if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) { struct ib_ah *ibah = ud_wr(ibwr)->ah; wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; wr->wr.ud.ah_num = to_rah(ibah)->ah_num; if (qp_type(qp) == IB_QPT_GSI) wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; break; case IB_WR_SEND: break; default: rxe_err_qp(qp, "bad wr opcode %d for UD/GSI QP\n", wr->opcode); return -EINVAL; } } else { switch (wr->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; fallthrough; case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; break; case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; break; case IB_WR_SEND_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; case IB_WR_RDMA_READ_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: wr->wr.atomic.remote_addr = atomic_wr(ibwr)->remote_addr; wr->wr.atomic.compare_add = atomic_wr(ibwr)->compare_add; wr->wr.atomic.swap = atomic_wr(ibwr)->swap; wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; break; case IB_WR_LOCAL_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; case IB_WR_REG_MR: wr->wr.reg.mr = reg_wr(ibwr)->mr; wr->wr.reg.key = reg_wr(ibwr)->key; wr->wr.reg.access = reg_wr(ibwr)->access; break; case IB_WR_SEND: case IB_WR_BIND_MW: case IB_WR_FLUSH: case IB_WR_ATOMIC_WRITE: break; default: rxe_err_qp(qp, "unsupported wr opcode %d\n", wr->opcode); return -EINVAL; } } return 0; } static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, const struct ib_send_wr *ibwr) { struct ib_sge *sge = ibwr->sg_list; u8 *p = wqe->dma.inline_data; int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length); p += sge->length; } } static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, unsigned int length, struct rxe_send_wqe *wqe) { int num_sge = ibwr->num_sge; int err; err = init_send_wr(qp, &wqe->wr, ibwr); if (err) return err; /* local operation */ if (unlikely(mask & WR_LOCAL_OP_MASK)) { wqe->mask = mask; wqe->state = wqe_state_posted; return 0; } if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) copy_inline_data_to_wqe(wqe, ibwr); else memcpy(wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr : mask & WR_READ_OR_WRITE_MASK ? rdma_wr(ibwr)->remote_addr : 0; wqe->mask = mask; wqe->dma.length = length; wqe->dma.resid = length; wqe->dma.num_sge = num_sge; wqe->dma.cur_sge = 0; wqe->dma.sge_offset = 0; wqe->state = wqe_state_posted; wqe->ssn = atomic_add_return(1, &qp->ssn); return 0; } static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr) { int err; struct rxe_sq *sq = &qp->sq; struct rxe_send_wqe *send_wqe; unsigned int mask; unsigned int length; int full; err = validate_send_wr(qp, ibwr, &mask, &length); if (err) return err; full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { rxe_err_qp(qp, "send queue full\n"); return -ENOMEM; } send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP); err = init_send_wqe(qp, ibwr, mask, length, send_wqe); if (!err) queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); return err; } static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *ibwr, const struct ib_send_wr **bad_wr) { int err = 0; unsigned long flags; int good = 0; spin_lock_irqsave(&qp->sq.sq_lock, flags); while (ibwr) { err = post_one_send(qp, ibwr); if (err) { *bad_wr = ibwr; break; } else { good++; } ibwr = ibwr->next; } spin_unlock_irqrestore(&qp->sq.sq_lock, flags); /* kickoff processing of any posted wqes */ if (good) rxe_sched_task(&qp->send_task); return err; } static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { struct rxe_qp *qp = to_rqp(ibqp); int err; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed\n"); return -EINVAL; } if (unlikely(qp_state(qp) < IB_QPS_RTS)) { spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_err_qp(qp, "qp not ready to send\n"); return -EINVAL; } spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->is_user) { /* Utilize process context to do protocol processing */ rxe_sched_task(&qp->send_task); } else { err = rxe_post_send_kernel(qp, wr, bad_wr); if (err) return err; } return 0; } /* recv wr */ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) { int i; unsigned long length; struct rxe_recv_wqe *recv_wqe; int num_sge = ibwr->num_sge; int full; int err; full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { err = -ENOMEM; rxe_dbg("queue full\n"); goto err_out; } if (unlikely(num_sge > rq->max_sge)) { err = -EINVAL; rxe_dbg("bad num_sge > max_sge\n"); goto err_out; } length = 0; for (i = 0; i < num_sge; i++) length += ibwr->sg_list[i].length; if (length > RXE_PORT_MAX_MSG_SZ) { err = -EINVAL; rxe_dbg("message length too long\n"); goto err_out; } recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); recv_wqe->wr_id = ibwr->wr_id; recv_wqe->dma.length = length; recv_wqe->dma.resid = length; recv_wqe->dma.num_sge = num_sge; recv_wqe->dma.cur_sge = 0; recv_wqe->dma.sge_offset = 0; memcpy(recv_wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); return 0; err_out: rxe_dbg("returned err = %d\n", err); return err; } static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_qp *qp = to_rqp(ibqp); struct rxe_rq *rq = &qp->rq; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed\n"); return -EINVAL; } /* see C10-97.2.1 */ if (unlikely((qp_state(qp) < IB_QPS_INIT))) { spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_dbg_qp(qp, "qp not ready to post recv\n"); return -EINVAL; } spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(qp->srq)) { *bad_wr = wr; rxe_dbg_qp(qp, "qp has srq, use post_srq_recv instead\n"); return -EINVAL; } spin_lock_irqsave(&rq->producer_lock, flags); while (wr) { err = post_one_recv(rq, wr); if (unlikely(err)) { *bad_wr = wr; break; } wr = wr->next; } spin_unlock_irqrestore(&rq->producer_lock, flags); spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->recv_task); spin_unlock_irqrestore(&qp->state_lock, flags); return err; } /* cq */ static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs) { struct ib_udata *udata = &attrs->driver_udata; struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); struct rxe_cq *cq = to_rcq(ibcq); struct rxe_create_cq_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } uresp = udata->outbuf; } if (attr->flags) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "bad attr->flags, err = %d\n", err); goto err_out; } err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); if (err) { rxe_dbg_dev(rxe, "bad init attributes, err = %d\n", err); goto err_out; } err = rxe_add_to_pool(&rxe->cq_pool, cq); if (err) { rxe_dbg_dev(rxe, "unable to create cq, err = %d\n", err); goto err_out; } err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, uresp); if (err) { rxe_dbg_cq(cq, "create cq failed, err = %d\n", err); goto err_cleanup; } return 0; err_cleanup: cleanup_err = rxe_cleanup(cq); if (cleanup_err) rxe_err_cq(cq, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); struct rxe_dev *rxe = to_rdev(ibcq->device); struct rxe_resize_cq_resp __user *uresp = NULL; int err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_cq(cq, "malformed udata\n"); goto err_out; } uresp = udata->outbuf; } err = rxe_cq_chk_attr(rxe, cq, cqe, 0); if (err) { rxe_dbg_cq(cq, "bad attr, err = %d\n", err); goto err_out; } err = rxe_cq_resize_queue(cq, cqe, uresp, udata); if (err) { rxe_dbg_cq(cq, "resize cq failed, err = %d\n", err); goto err_out; } return 0; err_out: rxe_err_cq(cq, "returned err = %d\n", err); return err; } static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { int i; struct rxe_cq *cq = to_rcq(ibcq); struct rxe_cqe *cqe; unsigned long flags; spin_lock_irqsave(&cq->cq_lock, flags); for (i = 0; i < num_entries; i++) { cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP); if (!cqe) break; /* queue empty */ memcpy(wc++, &cqe->ibwc, sizeof(*wc)); queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP); } spin_unlock_irqrestore(&cq->cq_lock, flags); return i; } static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) { struct rxe_cq *cq = to_rcq(ibcq); int count; count = queue_count(cq->queue, QUEUE_TYPE_TO_ULP); return (count > wc_cnt) ? wc_cnt : count; } static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct rxe_cq *cq = to_rcq(ibcq); int ret = 0; int empty; unsigned long irq_flags; spin_lock_irqsave(&cq->cq_lock, irq_flags); cq->notify |= flags & IB_CQ_SOLICITED_MASK; empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) ret = 1; spin_unlock_irqrestore(&cq->cq_lock, irq_flags); return ret; } static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); int err; /* See IBA C11-17: The CI shall return an error if this Verb is * invoked while a Work Queue is still associated with the CQ. */ if (atomic_read(&cq->num_wq)) { err = -EINVAL; rxe_dbg_cq(cq, "still in use\n"); goto err_out; } err = rxe_cleanup(cq); if (err) rxe_err_cq(cq, "cleanup failed, err = %d\n", err); return 0; err_out: rxe_err_cq(cq, "returned err = %d\n", err); return err; } /* mr */ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { rxe_dbg_dev(rxe, "unable to create mr\n"); goto err_free; } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; rxe_mr_init_dma(access, mr); rxe_finalize(mr); return &mr->ibmr; err_free: kfree(mr); rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int access, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err, cleanup_err; if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_pd(pd, "access = %#x not supported (%#x)\n", access, RXE_ACCESS_SUPPORTED_MR); return ERR_PTR(-EOPNOTSUPP); } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { rxe_dbg_pd(pd, "unable to create mr\n"); goto err_free; } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; if (access & IB_ACCESS_ON_DEMAND) err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr); else err = rxe_mr_init_user(rxe, start, length, access, mr); if (err) { rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(mr); return &mr->ibmr; err_cleanup: cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", cleanup_err); err_free: kfree(mr); rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, u64 iova, int access, struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); struct rxe_pd *old_pd = to_rpd(ibmr->pd); struct rxe_pd *pd = to_rpd(ibpd); /* for now only support the two easy cases: * rereg_pd and rereg_access */ if (flags & ~RXE_MR_REREG_SUPPORTED) { rxe_err_mr(mr, "flags = %#x not supported\n", flags); return ERR_PTR(-EOPNOTSUPP); } if (flags & IB_MR_REREG_PD) { rxe_put(old_pd); rxe_get(pd); mr->ibmr.pd = ibpd; } if (flags & IB_MR_REREG_ACCESS) { if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_mr(mr, "access = %#x not supported\n", access); return ERR_PTR(-EOPNOTSUPP); } mr->access = access; } return NULL; } static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err, cleanup_err; if (mr_type != IB_MR_TYPE_MEM_REG) { err = -EINVAL; rxe_dbg_pd(pd, "mr type %d not supported, err = %d\n", mr_type, err); goto err_out; } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; err = rxe_mr_init_fast(max_num_sg, mr); if (err) { rxe_dbg_mr(mr, "alloc_mr failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(mr); return &mr->ibmr; err_cleanup: cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", err); err_free: kfree(mr); err_out: rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); int err, cleanup_err; /* See IBA 10.6.7.2.6 */ if (atomic_read(&mr->num_mw) > 0) { err = -EINVAL; rxe_dbg_mr(mr, "mr has mw's bound\n"); goto err_out; } cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", cleanup_err); kfree_rcu_mightsleep(mr); return 0; err_out: rxe_err_mr(mr, "returned err = %d\n", err); return err; } static ssize_t parent_show(struct device *device, struct device_attribute *attr, char *buf) { struct rxe_dev *rxe = rdma_device_to_drv_device(device, struct rxe_dev, ib_dev); return sysfs_emit(buf, "%s\n", rxe_parent_name(rxe, 1)); } static DEVICE_ATTR_RO(parent); static struct attribute *rxe_dev_attributes[] = { &dev_attr_parent.attr, NULL }; static const struct attribute_group rxe_attr_group = { .attrs = rxe_dev_attributes, }; static int rxe_enable_driver(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); struct net_device *ndev; ndev = rxe_ib_device_get_netdev(ib_dev); if (!ndev) return -ENODEV; rxe_set_port_state(rxe); dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev)); dev_put(ndev); return 0; } static const struct ib_device_ops rxe_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_RXE, .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, .alloc_hw_port_stats = rxe_ib_alloc_hw_port_stats, .alloc_mr = rxe_alloc_mr, .alloc_mw = rxe_alloc_mw, .alloc_pd = rxe_alloc_pd, .alloc_ucontext = rxe_alloc_ucontext, .attach_mcast = rxe_attach_mcast, .create_ah = rxe_create_ah, .create_cq = rxe_create_cq, .create_qp = rxe_create_qp, .create_srq = rxe_create_srq, .create_user_ah = rxe_create_ah, .dealloc_driver = rxe_dealloc, .dealloc_mw = rxe_dealloc_mw, .dealloc_pd = rxe_dealloc_pd, .dealloc_ucontext = rxe_dealloc_ucontext, .dereg_mr = rxe_dereg_mr, .destroy_ah = rxe_destroy_ah, .destroy_cq = rxe_destroy_cq, .destroy_qp = rxe_destroy_qp, .destroy_srq = rxe_destroy_srq, .detach_mcast = rxe_detach_mcast, .device_group = &rxe_attr_group, .enable_driver = rxe_enable_driver, .get_dma_mr = rxe_get_dma_mr, .get_hw_stats = rxe_ib_get_hw_stats, .get_link_layer = rxe_get_link_layer, .get_port_immutable = rxe_port_immutable, .map_mr_sg = rxe_map_mr_sg, .mmap = rxe_mmap, .modify_ah = rxe_modify_ah, .modify_device = rxe_modify_device, .modify_port = rxe_modify_port, .modify_qp = rxe_modify_qp, .modify_srq = rxe_modify_srq, .peek_cq = rxe_peek_cq, .poll_cq = rxe_poll_cq, .post_recv = rxe_post_recv, .post_send = rxe_post_send, .post_srq_recv = rxe_post_srq_recv, .query_ah = rxe_query_ah, .query_device = rxe_query_device, .query_pkey = rxe_query_pkey, .query_gid = rxe_query_gid, .query_port = rxe_query_port, .query_qp = rxe_query_qp, .query_srq = rxe_query_srq, .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, .rereg_user_mr = rxe_rereg_user_mr, .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, rxe_qp, ibqp), INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), }; int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, struct net_device *ndev) { int err; struct ib_device *dev = &rxe->ib_dev; strscpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; addrconf_addr_eui48((unsigned char *)&dev->node_guid, rxe->raw_gid); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); ib_set_device_ops(dev, &rxe_dev_ops); err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1); if (err) return err; err = ib_register_device(dev, ibdev_name, NULL); if (err) rxe_dbg_dev(rxe, "failed with error %d\n", err); /* * Note that rxe may be invalid at this point if another thread * unregistered it. */ return err; }
1268 108 491 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timestamp #if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMESTAMP_H #include <linux/tracepoint.h> #include <linux/fs.h> #define CTIME_QUERIED_FLAGS \ { I_CTIME_QUERIED, "Q" } DECLARE_EVENT_CLASS(ctime, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(time64_t, ctime_s) __field(u32, ctime_ns) __field(u32, gen) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->ctime_s = ctime->tv_sec; __entry->ctime_ns = ctime->tv_nsec; ), TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns ) ); DEFINE_EVENT(ctime, inode_set_ctime_to_ts, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime)); DEFINE_EVENT(ctime, ctime_xchg_skip, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime)); TRACE_EVENT(ctime_ns_xchg, TP_PROTO(struct inode *inode, u32 old, u32 new, u32 cur), TP_ARGS(inode, old, new, cur), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(u32, gen) __field(u32, old) __field(u32, new) __field(u32, cur) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->old = old; __entry->new = new; __entry->cur = cur; ), TP_printk("ino=%d:%d:%ld:%u old=%u:%s new=%u cur=%u:%s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->old & ~I_CTIME_QUERIED, __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS), __entry->new, __entry->cur & ~I_CTIME_QUERIED, __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS) ) ); TRACE_EVENT(fill_mg_cmtime, TP_PROTO(struct inode *inode, struct timespec64 *ctime, struct timespec64 *mtime), TP_ARGS(inode, ctime, mtime), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(time64_t, ctime_s) __field(time64_t, mtime_s) __field(u32, ctime_ns) __field(u32, mtime_ns) __field(u32, gen) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->ctime_s = ctime->tv_sec; __entry->mtime_s = mtime->tv_sec; __entry->ctime_ns = ctime->tv_nsec; __entry->mtime_ns = mtime->tv_nsec; ), TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u mtime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns, __entry->mtime_s, __entry->mtime_ns ) ); #endif /* _TRACE_TIMESTAMP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
9 9 9 9 9 9 3 6 11 11 1 10 6 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> */ #include <linux/filter.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/bpf.h> #include <net/lwtunnel.h> #include <net/gre.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/ipv6_stubs.h> #include <net/inet_dscp.h> struct bpf_lwt_prog { struct bpf_prog *prog; char *name; }; struct bpf_lwt { struct bpf_lwt_prog in; struct bpf_lwt_prog out; struct bpf_lwt_prog xmit; int family; }; #define MAX_PROG_NAME 256 static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct bpf_lwt *)lwt->data; } #define NO_REDIRECT false #define CAN_REDIRECT true static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, struct dst_entry *dst, bool can_redirect) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int ret; /* Disabling BH is needed to protect per-CPU bpf_redirect_info between * BPF prog and skb_do_redirect(). */ local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); switch (ret) { case BPF_OK: case BPF_LWT_REROUTE: break; case BPF_REDIRECT: if (unlikely(!can_redirect)) { pr_warn_once("Illegal redirect return code in prog %s\n", lwt->name ? : "<unknown>"); ret = BPF_OK; } else { skb_reset_mac_header(skb); skb_do_redirect(skb); ret = BPF_REDIRECT; } break; case BPF_DROP: kfree_skb(skb); ret = -EPERM; break; default: pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); kfree_skb(skb); ret = -EINVAL; break; } bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return ret; } static int bpf_lwt_input_reroute(struct sk_buff *skb) { enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { struct net_device *dev = skb_dst(skb)->dev; const struct iphdr *iph = ip_hdr(skb); dev_hold(dev); skb_dst_drop(skb); reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); err = reason ? -EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); err = ipv6_stub->ipv6_route_input(skb); } else { err = -EAFNOSUPPORT; } if (err) goto err; return dst_input(skb); err: kfree_skb(skb); return err; } static int bpf_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; int ret; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->in.prog) { ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); if (ret < 0) return ret; if (ret == BPF_LWT_REROUTE) return bpf_lwt_input_reroute(skb); } if (unlikely(!dst->lwtstate->orig_input)) { kfree_skb(skb); return -EINVAL; } return dst->lwtstate->orig_input(skb); } static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; int ret; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->out.prog) { ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); if (ret < 0) return ret; } if (unlikely(!dst->lwtstate->orig_output)) { pr_warn_once("orig_output not set on dst for prog %s\n", bpf->out.name); kfree_skb(skb); return -EINVAL; } return dst->lwtstate->orig_output(net, sk, skb); } static int xmit_check_hhlen(struct sk_buff *skb, int hh_len) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) return -ENOMEM; } return 0; } static int bpf_lwt_xmit_reroute(struct sk_buff *skb) { struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); int oif = l3mdev ? l3mdev->ifindex : 0; struct dst_entry *dst = NULL; int err = -EAFNOSUPPORT; struct sock *sk; struct net *net; bool ipv4; if (skb->protocol == htons(ETH_P_IP)) ipv4 = true; else if (skb->protocol == htons(ETH_P_IPV6)) ipv4 = false; else goto err; sk = sk_to_full_sk(skb->sk); if (sk) { if (sk->sk_bound_dev_if) oif = sk->sk_bound_dev_if; net = sock_net(sk); } else { net = dev_net(skb_dst(skb)->dev); } if (ipv4) { struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = {}; struct rtable *rt; fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; fl4.saddr = iph->saddr; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto err; } dst = &rt->dst; } else { struct ipv6hdr *iph6 = ipv6_hdr(skb); struct flowi6 fl6 = {}; fl6.flowi6_oif = oif; fl6.flowi6_mark = skb->mark; fl6.flowi6_uid = sock_net_uid(net, sk); fl6.flowlabel = ip6_flowinfo(iph6); fl6.flowi6_proto = iph6->nexthdr; fl6.daddr = iph6->daddr; fl6.saddr = iph6->saddr; dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err; } } if (unlikely(dst->error)) { err = dst->error; dst_release(dst); goto err; } /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it * was done for the previous dst, so we are doing it here again, in * case the new dst needs much more space. The call below is a noop * if there is enough header space in skb. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) goto err; skb_dst_drop(skb); skb_dst_set(skb, dst); err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) return net_xmit_errno(err); /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; err: kfree_skb(skb); return err; } static int bpf_xmit(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { int hh_len = dst->dev->hard_header_len; __be16 proto = skb->protocol; int ret; ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); switch (ret) { case BPF_OK: /* If the header changed, e.g. via bpf_lwt_push_encap, * BPF_LWT_REROUTE below should have been used if the * protocol was also changed. */ if (skb->protocol != proto) { kfree_skb(skb); return -EINVAL; } /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. */ ret = xmit_check_hhlen(skb, hh_len); if (unlikely(ret)) return ret; return LWTUNNEL_XMIT_CONTINUE; case BPF_REDIRECT: return LWTUNNEL_XMIT_DONE; case BPF_LWT_REROUTE: return bpf_lwt_xmit_reroute(skb); default: return ret; } } return LWTUNNEL_XMIT_CONTINUE; } static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) { if (prog->prog) bpf_prog_put(prog->prog); kfree(prog->name); } static void bpf_destroy_state(struct lwtunnel_state *lwt) { struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); bpf_lwt_prog_destroy(&bpf->in); bpf_lwt_prog_destroy(&bpf->out); bpf_lwt_prog_destroy(&bpf->xmit); } static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, .len = MAX_PROG_NAME }, }; static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, enum bpf_prog_type type) { struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; struct bpf_prog *p; int ret; u32 fd; ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy, NULL); if (ret < 0) return ret; if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) return -EINVAL; prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC); if (!prog->name) return -ENOMEM; fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); p = bpf_prog_get_type(fd, type); if (IS_ERR(p)) return PTR_ERR(p); prog->prog = p; return 0; } static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { [LWT_BPF_IN] = { .type = NLA_NESTED, }, [LWT_BPF_OUT] = { .type = NLA_NESTED, }, [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, }; static int bpf_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_BPF_MAX + 1]; struct lwtunnel_state *newts; struct bpf_lwt *bpf; int ret; if (family != AF_INET && family != AF_INET6) return -EAFNOSUPPORT; ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack); if (ret < 0) return ret; if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) return -EINVAL; newts = lwtunnel_state_alloc(sizeof(*bpf)); if (!newts) return -ENOMEM; newts->type = LWTUNNEL_ENCAP_BPF; bpf = bpf_lwt_lwtunnel(newts); if (tb[LWT_BPF_IN]) { newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, BPF_PROG_TYPE_LWT_IN); if (ret < 0) goto errout; } if (tb[LWT_BPF_OUT]) { newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, BPF_PROG_TYPE_LWT_OUT); if (ret < 0) goto errout; } if (tb[LWT_BPF_XMIT]) { newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, BPF_PROG_TYPE_LWT_XMIT); if (ret < 0) goto errout; } if (tb[LWT_BPF_XMIT_HEADROOM]) { u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); if (headroom > LWT_BPF_MAX_HEADROOM) { ret = -ERANGE; goto errout; } newts->headroom = headroom; } bpf->family = family; *ts = newts; return 0; errout: bpf_destroy_state(newts); kfree(newts); return ret; } static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, struct bpf_lwt_prog *prog) { struct nlattr *nest; if (!prog->prog) return 0; nest = nla_nest_start_noflag(skb, attr); if (!nest) return -EMSGSIZE; if (prog->name && nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) return -EMSGSIZE; return nla_nest_end(skb, nest); } static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) return -EMSGSIZE; return 0; } static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) { int nest_len = nla_total_size(sizeof(struct nlattr)) + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ 0; return nest_len + /* LWT_BPF_IN */ nest_len + /* LWT_BPF_OUT */ nest_len + /* LWT_BPF_XMIT */ 0; } static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) { /* FIXME: * The LWT state is currently rebuilt for delete requests which * results in a new bpf_prog instance. Comparing names for now. */ if (!a->name && !b->name) return 0; if (!a->name || !b->name) return 1; return strcmp(a->name, b->name); } static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); } static const struct lwtunnel_encap_ops bpf_encap_ops = { .build_state = bpf_build_state, .destroy_state = bpf_destroy_state, .input = bpf_input, .output = bpf_output, .xmit = bpf_xmit, .fill_encap = bpf_fill_encap_info, .get_encap_size = bpf_encap_nlsize, .cmp_encap = bpf_encap_cmp, .owner = THIS_MODULE, }; static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, int encap_len) { struct skb_shared_info *shinfo = skb_shinfo(skb); gso_type |= SKB_GSO_DODGY; shinfo->gso_type |= gso_type; skb_decrease_gso_size(shinfo, encap_len); shinfo->gso_segs = 0; return 0; } static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) { int next_hdr_offset; void *next_hdr; __u8 protocol; /* SCTP and UDP_L4 gso need more nuanced handling than what * handle_gso_type() does above: skb_decrease_gso_size() is not enough. * So at the moment only TCP GSO packets are let through. */ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) return -ENOTSUPP; if (ipv4) { protocol = ip_hdr(skb)->protocol; next_hdr_offset = sizeof(struct iphdr); next_hdr = skb_network_header(skb) + next_hdr_offset; } else { protocol = ipv6_hdr(skb)->nexthdr; next_hdr_offset = sizeof(struct ipv6hdr); next_hdr = skb_network_header(skb) + next_hdr_offset; } switch (protocol) { case IPPROTO_GRE: next_hdr_offset += sizeof(struct gre_base_hdr); if (next_hdr_offset > encap_len) return -EINVAL; if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) return handle_gso_type(skb, SKB_GSO_GRE_CSUM, encap_len); return handle_gso_type(skb, SKB_GSO_GRE, encap_len); case IPPROTO_UDP: next_hdr_offset += sizeof(struct udphdr); if (next_hdr_offset > encap_len) return -EINVAL; if (((struct udphdr *)next_hdr)->check) return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, encap_len); return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); case IPPROTO_IP: case IPPROTO_IPV6: if (ipv4) return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); else return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); default: return -EPROTONOSUPPORT; } } int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { struct iphdr *iph; bool ipv4; int err; if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) return -EINVAL; /* validate protocol and length */ iph = (struct iphdr *)hdr; if (iph->version == 4) { ipv4 = true; if (unlikely(len < iph->ihl * 4)) return -EINVAL; } else if (iph->version == 6) { ipv4 = false; if (unlikely(len < sizeof(struct ipv6hdr))) return -EINVAL; } else { return -EINVAL; } if (ingress) err = skb_cow_head(skb, len + skb->mac_len); else err = skb_cow_head(skb, len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); if (unlikely(err)) return err; /* push the encap headers and fix pointers */ skb_reset_inner_headers(skb); skb_reset_inner_mac_header(skb); /* mac header is not yet set */ skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_push(skb, len); if (ingress) skb_postpush_rcsum(skb, iph, len); skb_reset_network_header(skb); memcpy(skb_network_header(skb), hdr, len); bpf_compute_data_pointers(skb); skb_clear_hash(skb); if (ipv4) { skb->protocol = htons(ETH_P_IP); iph = ip_hdr(skb); if (!iph->check) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } else { skb->protocol = htons(ETH_P_IPV6); } if (skb_is_gso(skb)) return handle_gso_encap(skb, ipv4, len); return 0; } static int __init bpf_lwt_init(void) { return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); } subsys_initcall(bpf_lwt_init)
8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 // SPDX-License-Identifier: GPL-2.0-only /* * xt_conntrack - Netfilter module to match connection tracking * information. (Superset of Rusty's minimalistic state match.) * * (C) 2001 Marc Boucher (marc@mbsi.ca). * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_conntrack.h> #include <net/netfilter/nf_conntrack.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: connection tracking state match"); MODULE_ALIAS("ipt_conntrack"); MODULE_ALIAS("ip6t_conntrack"); static bool conntrack_addrcmp(const union nf_inet_addr *kaddr, const union nf_inet_addr *uaddr, const union nf_inet_addr *umask, unsigned int l3proto) { if (l3proto == NFPROTO_IPV4) return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; else if (l3proto == NFPROTO_IPV6) return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, &uaddr->in6) == 0; else return false; } static inline bool conntrack_mt_origsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, &info->origsrc_addr, &info->origsrc_mask, family); } static inline bool conntrack_mt_origdst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, &info->origdst_addr, &info->origdst_mask, family); } static inline bool conntrack_mt_replsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, &info->replsrc_addr, &info->replsrc_mask, family); } static inline bool conntrack_mt_repldst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, &info->repldst_addr, &info->repldst_mask, family); } static inline bool ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, const struct nf_conn *ct) { const struct nf_conntrack_tuple *tuple; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; if ((info->match_flags & XT_CONNTRACK_PROTO) && (nf_ct_protonum(ct) == info->l4proto) ^ !(info->invert_flags & XT_CONNTRACK_PROTO)) return false; /* Shortcut to match all recognized protocols by using ->src.all. */ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && (tuple->src.u.all == info->origsrc_port) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && (tuple->dst.u.all == info->origdst_port) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) return false; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && (tuple->src.u.all == info->replsrc_port) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && (tuple->dst.u.all == info->repldst_port) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) return false; return true; } static inline bool port_match(u16 min, u16 max, u16 port, bool invert) { return (port >= min && port <= max) ^ invert; } static inline bool ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, const struct nf_conn *ct) { const struct nf_conntrack_tuple *tuple; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; if ((info->match_flags & XT_CONNTRACK_PROTO) && (nf_ct_protonum(ct) == info->l4proto) ^ !(info->invert_flags & XT_CONNTRACK_PROTO)) return false; /* Shortcut to match all recognized protocols by using ->src.all. */ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && !port_match(info->origsrc_port, info->origsrc_port_high, ntohs(tuple->src.u.all), info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && !port_match(info->origdst_port, info->origdst_port_high, ntohs(tuple->dst.u.all), info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) return false; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && !port_match(info->replsrc_port, info->replsrc_port_high, ntohs(tuple->src.u.all), info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && !port_match(info->repldst_port, info->repldst_port_high, ntohs(tuple->dst.u.all), info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) return false; return true; } static bool conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 state_mask, u16 status_mask) { const struct xt_conntrack_mtinfo2 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int statebit; ct = nf_ct_get(skb, &ctinfo); if (ct) statebit = XT_CONNTRACK_STATE_BIT(ctinfo); else if (ctinfo == IP_CT_UNTRACKED) statebit = XT_CONNTRACK_STATE_UNTRACKED; else statebit = XT_CONNTRACK_STATE_INVALID; if (info->match_flags & XT_CONNTRACK_STATE) { if (ct != NULL) { if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) statebit |= XT_CONNTRACK_STATE_SNAT; if (test_bit(IPS_DST_NAT_BIT, &ct->status)) statebit |= XT_CONNTRACK_STATE_DNAT; } if (!!(state_mask & statebit) ^ !(info->invert_flags & XT_CONNTRACK_STATE)) return false; } if (ct == NULL) return info->match_flags & XT_CONNTRACK_STATE; if ((info->match_flags & XT_CONNTRACK_DIRECTION) && (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ !(info->invert_flags & XT_CONNTRACK_DIRECTION)) return false; if (info->match_flags & XT_CONNTRACK_ORIGSRC) if (conntrack_mt_origsrc(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) return false; if (info->match_flags & XT_CONNTRACK_ORIGDST) if (conntrack_mt_origdst(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST)) return false; if (info->match_flags & XT_CONNTRACK_REPLSRC) if (conntrack_mt_replsrc(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC)) return false; if (info->match_flags & XT_CONNTRACK_REPLDST) if (conntrack_mt_repldst(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST)) return false; if (par->match->revision != 3) { if (!ct_proto_port_check(info, ct)) return false; } else { if (!ct_proto_port_check_v3(par->matchinfo, ct)) return false; } if ((info->match_flags & XT_CONNTRACK_STATUS) && (!!(status_mask & ct->status) ^ !(info->invert_flags & XT_CONNTRACK_STATUS))) return false; if (info->match_flags & XT_CONNTRACK_EXPIRES) { unsigned long expires = nf_ct_expires(ct) / HZ; if ((expires >= info->expires_min && expires <= info->expires_max) ^ !(info->invert_flags & XT_CONNTRACK_EXPIRES)) return false; } return true; } static bool conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo1 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static bool conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo2 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static bool conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo3 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 1, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt_v1, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, { .name = "conntrack", .revision = 2, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo2), .match = conntrack_mt_v2, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, { .name = "conntrack", .revision = 3, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo3), .match = conntrack_mt_v3, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, }; static int __init conntrack_mt_init(void) { return xt_register_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); } static void __exit conntrack_mt_exit(void) { xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); } module_init(conntrack_mt_init); module_exit(conntrack_mt_exit);
28 28 26 11 11 1 2 2 1 1 4 4 4 4 4 4 7 7 1 6 4 1 3 3 1 1 3 1 1 15 14 2 4 22 1 1 17 3 17 5 5 5 4 1 6 3 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2018 Facebook */ #include <linux/bpf.h> #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> #include <linux/btf_ids.h> struct reuseport_array { struct bpf_map map; struct sock __rcu *ptrs[]; }; static struct reuseport_array *reuseport_array(struct bpf_map *map) { return (struct reuseport_array *)map; } /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of * sk->sk_callback_lock because there is * a race with reuseport_array_free() * which does not hold the reuseport_lock. */ RCU_INIT_POINTER(*socks, NULL); } write_unlock_bh(&sk->sk_callback_lock); } static int reuseport_array_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) return -EINVAL; return array_map_alloc_check(attr); } static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return rcu_dereference(array->ptrs[index]); } /* Called from syscall only */ static long reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; struct sock *sk; int err; if (index >= map->max_entries) return -E2BIG; if (!rcu_access_pointer(array->ptrs[index])) return -ENOENT; spin_lock_bh(&reuseport_lock); sk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); if (sk) { write_lock_bh(&sk->sk_callback_lock); WRITE_ONCE(sk->sk_user_data, NULL); RCU_INIT_POINTER(array->ptrs[index], NULL); write_unlock_bh(&sk->sk_callback_lock); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&reuseport_lock); return err; } static void reuseport_array_free(struct bpf_map *map) { struct reuseport_array *array = reuseport_array(map); struct sock *sk; u32 i; /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with * bpf_sk_reuseport_detach() which was triggered by * close() or disconnect(). * * This function and bpf_sk_reuseport_detach() are * both removing sk from "array". Who removes it * first does not matter. * * The only concern here is bpf_sk_reuseport_detach() * may access "array" which is being freed here. * bpf_sk_reuseport_detach() access this "array" * through sk->sk_user_data _and_ with sk->sk_callback_lock * held which is enough because this "array" is not freed * until all sk->sk_user_data has stopped referencing this "array". * * Hence, due to the above, taking "reuseport_lock" is not * needed here. */ /* * Since reuseport_lock is not taken, sk is accessed under * rcu_read_lock() */ rcu_read_lock(); for (i = 0; i < map->max_entries; i++) { sk = rcu_dereference(array->ptrs[i]); if (sk) { write_lock_bh(&sk->sk_callback_lock); /* * No need for WRITE_ONCE(). At this point, * no one is reading it without taking the * sk->sk_callback_lock. */ sk->sk_user_data = NULL; write_unlock_bh(&sk->sk_callback_lock); RCU_INIT_POINTER(array->ptrs[i], NULL); } } rcu_read_unlock(); /* * Once reaching here, all sk->sk_user_data is not * referencing this "array". "array" can be freed now. */ bpf_map_area_free(array); } static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); return &array->map; } int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { struct sock *sk; int err; if (map->value_size != sizeof(u64)) return -ENOSPC; rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; } rcu_read_unlock(); return err; } static int reuseport_array_update_check(const struct reuseport_array *array, const struct sock *nsk, const struct sock *osk, const struct sock_reuseport *nsk_reuse, u32 map_flags) { if (osk && map_flags == BPF_NOEXIST) return -EEXIST; if (!osk && map_flags == BPF_EXIST) return -ENOENT; if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) return -ENOTSUPP; if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) return -ENOTSUPP; if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) return -ENOTSUPP; /* * sk must be hashed (i.e. listening in the TCP case or binded * in the UDP case) and * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). * * Also, sk will be used in bpf helper that is protected by * rcu_read_lock(). */ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) return -EINVAL; /* READ_ONCE because the sk->sk_callback_lock may not be held here */ if (READ_ONCE(nsk->sk_user_data)) return -EBUSY; return 0; } /* * Called from syscall only. * The "nsk" in the fd refcnt. * The "osk" and "reuse" are protected by reuseport_lock. */ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct reuseport_array *array = reuseport_array(map); struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; uintptr_t sk_user_data; struct socket *socket; int err, fd; if (map_flags > BPF_EXIST) return -EINVAL; if (index >= map->max_entries) return -E2BIG; if (map->value_size == sizeof(u64)) { u64 fd64 = *(u64 *)value; if (fd64 > S32_MAX) return -EINVAL; fd = fd64; } else { fd = *(int *)value; } socket = sockfd_lookup(fd, &err); if (!socket) return err; nsk = socket->sk; if (!nsk) { err = -EINVAL; goto put_file; } /* Quick checks before taking reuseport_lock */ err = reuseport_array_update_check(array, nsk, rcu_access_pointer(array->ptrs[index]), rcu_access_pointer(nsk->sk_reuseport_cb), map_flags); if (err) goto put_file; spin_lock_bh(&reuseport_lock); /* * Some of the checks only need reuseport_lock * but it is done under sk_callback_lock also * for simplicity reason. */ write_lock_bh(&nsk->sk_callback_lock); osk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); if (err) goto put_file_unlock; sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; put_file_unlock: write_unlock_bh(&nsk->sk_callback_lock); if (free_osk) { write_lock_bh(&free_osk->sk_callback_lock); WRITE_ONCE(free_osk->sk_user_data, NULL); write_unlock_bh(&free_osk->sk_callback_lock); } spin_unlock_bh(&reuseport_lock); put_file: sockfd_put(socket); return err; } /* Called from syscall */ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct reuseport_array *array = reuseport_array(map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { *next = 0; return 0; } if (index == array->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static u64 reuseport_array_mem_usage(const struct bpf_map *map) { struct reuseport_array *array; return struct_size(array, ptrs, map->max_entries); } BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, .map_mem_usage = reuseport_array_mem_usage, .map_btf_id = &reuseport_array_map_btf_ids[0], };
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio PCI driver - common functionality for all device versions * * This module allows virtio devices to be used over a virtual PCI device. * This can be used with QEMU based VMMs like KVM or Xen. * * Copyright IBM Corp. 2007 * Copyright Red Hat, Inc. 2014 * * Authors: * Anthony Liguori <aliguori@us.ibm.com> * Rusty Russell <rusty@rustcorp.com.au> * Michael S. Tsirkin <mst@redhat.com> */ #include "virtio_pci_common.h" static bool force_legacy = false; #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) module_param(force_legacy, bool, 0444); MODULE_PARM_DESC(force_legacy, "Force legacy mode for transitional virtio 1 devices"); #endif bool vp_is_avq(struct virtio_device *vdev, unsigned int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return false; return index == vp_dev->admin_vq.vq_index; } /* wait for pending irq handlers */ void vp_synchronize_vectors(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i; if (vp_dev->intx_enabled) synchronize_irq(vp_dev->pci_dev->irq); for (i = 0; i < vp_dev->msix_vectors; ++i) synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); } /* the notify function used when creating a virt queue */ bool vp_notify(struct virtqueue *vq) { /* we write the queue's selector into the notification register to * signal the other end */ iowrite16(vq->index, (void __iomem *)vq->priv); return true; } /* Notify all slow path virtqueues on an interrupt. */ static void vp_vring_slow_path_interrupt(int irq, struct virtio_pci_device *vp_dev) { struct virtio_pci_vq_info *info; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->slow_virtqueues, node) vring_interrupt(irq, info->vq); spin_unlock_irqrestore(&vp_dev->lock, flags); } /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); vp_vring_slow_path_interrupt(irq, vp_dev); return IRQ_HANDLED; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vp_vring_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; struct virtio_pci_vq_info *info; irqreturn_t ret = IRQ_NONE; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->virtqueues, node) { if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) ret = IRQ_HANDLED; } spin_unlock_irqrestore(&vp_dev->lock, flags); return ret; } /* A small wrapper to also acknowledge the interrupt when it's handled. * I really need an EIO hook for the vring so I can ack the interrupt once we * know that we'll be handling the IRQ but before we invoke the callback since * the callback may notify the host which results in the host attempting to * raise an interrupt that we would then mask once we acknowledged the * interrupt. */ static irqreturn_t vp_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; u8 isr; /* reading the ISR has the effect of also clearing it so it's very * important to save off the value. */ isr = ioread8(vp_dev->isr); /* It's definitely not us if the ISR was not high */ if (!isr) return IRQ_NONE; /* Configuration change? Tell driver if it wants to know. */ if (isr & VIRTIO_PCI_ISR_CONFIG) vp_config_changed(irq, opaque); return vp_vring_interrupt(irq, opaque); } static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, bool per_vq_vectors, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned int flags = PCI_IRQ_MSIX; unsigned int i, v; int err = -ENOMEM; vp_dev->msix_vectors = nvectors; vp_dev->msix_names = kmalloc_array(nvectors, sizeof(*vp_dev->msix_names), GFP_KERNEL); if (!vp_dev->msix_names) goto error; vp_dev->msix_affinity_masks = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks), GFP_KERNEL); if (!vp_dev->msix_affinity_masks) goto error; for (i = 0; i < nvectors; ++i) if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], GFP_KERNEL)) goto error; if (!per_vq_vectors) desc = NULL; if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ } err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, nvectors, flags, desc); if (err < 0) goto error; vp_dev->msix_enabled = 1; /* Set the vector used for configuration */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-config", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; v = vp_dev->config_vector(vp_dev, v); /* Verify we had enough resources to assign the vector */ if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto error; } if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-virtqueues", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; } return 0; error: return err; } static bool vp_is_slow_path_vector(u16 msix_vec) { return msix_vec == VP_MSIX_CONFIG_VECTOR; } static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx, u16 msix_vec, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); struct virtqueue *vq; unsigned long flags; /* fill out our structure that represents an active queue */ if (!info) return ERR_PTR(-ENOMEM); vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); if (!vp_is_slow_path_vector(msix_vec)) list_add(&info->node, &vp_dev->virtqueues); else list_add(&info->node, &vp_dev->slow_virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); } *p_info = info; return vq; out_info: kfree(info); return vq; } static void vp_del_vq(struct virtqueue *vq, struct virtio_pci_vq_info *info) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); unsigned long flags; /* * If it fails during re-enable reset vq. This way we won't rejoin * info->node to the queue. Prevent unexpected irqs. */ if (!vq->reset) { spin_lock_irqsave(&vp_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); } vp_dev->del_vq(info); kfree(info); } /* the config->del_vqs() implementation */ void vp_del_vqs(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq, *n; int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { info = vp_is_avq(vdev, vq->index) ? vp_dev->admin_vq.info : vp_dev->vqs[vq->index]; if (vp_dev->per_vq_vectors) { int v = info->msix_vector; if (v != VIRTIO_MSI_NO_VECTOR && !vp_is_slow_path_vector(v)) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } vp_del_vq(vq, info); } vp_dev->per_vq_vectors = false; if (vp_dev->intx_enabled) { free_irq(vp_dev->pci_dev->irq, vp_dev); vp_dev->intx_enabled = 0; } for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); if (vp_dev->msix_affinity_masks) { for (i = 0; i < vp_dev->msix_vectors; i++) free_cpumask_var(vp_dev->msix_affinity_masks[i]); } if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); pci_free_irq_vectors(vp_dev->pci_dev); vp_dev->msix_enabled = 0; } vp_dev->msix_vectors = 0; vp_dev->msix_used_vectors = 0; kfree(vp_dev->msix_names); vp_dev->msix_names = NULL; kfree(vp_dev->msix_affinity_masks); vp_dev->msix_affinity_masks = NULL; kfree(vp_dev->vqs); vp_dev->vqs = NULL; } enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_EACH, VP_VQ_VECTOR_POLICY_SHARED_SLOW, VP_VQ_VECTOR_POLICY_SHARED, }; static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, bool slow_path, int *allocated_vectors, enum vp_vq_vector_policy vector_policy, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; u16 msix_vec; int err; if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH || (vector_policy == VP_VQ_VECTOR_POLICY_SHARED_SLOW && !slow_path)) msix_vec = (*allocated_vectors)++; else if (vector_policy != VP_VQ_VECTOR_POLICY_EACH && slow_path) msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec, p_info); if (IS_ERR(vq)) return vq; if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || msix_vec == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(msix_vec)) return vq; /* allocate per-vq irq if available and necessary */ snprintf(vp_dev->msix_names[msix_vec], sizeof(*vp_dev->msix_names), "%s-%s", dev_name(&vp_dev->vdev.dev), name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vq); if (err) { vp_del_vq(vq, *p_info); return ERR_PTR(err); } return vq; } static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], enum vp_vq_vector_policy vector_policy, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; struct virtqueue *vq; bool per_vq_vectors; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto error_find; } per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ nvectors = 1; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (vqi->name && vqi->callback) ++nvectors; } if (avq_num && vector_policy == VP_VQ_VECTOR_POLICY_EACH) ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ nvectors = 2; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, desc); if (err) goto error_find; vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, false, &allocated_vectors, vector_policy, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_find_one_vq_msix(vdev, avq->vq_index, vp_modern_avq_done, avq->name, false, true, &allocated_vectors, vector_policy, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto error_find; } return 0; error_find: vp_del_vqs(vdev); return err; } static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[]) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; int i, err, queue_idx = 0; struct virtqueue *vq; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto out_del_vqs; } err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) goto out_del_vqs; vp_dev->intx_enabled = 1; vp_dev->per_vq_vectors = false; for (i = 0; i < nvqs; ++i) { struct virtqueue_info *vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_setup_vq(vdev, queue_idx++, vp_modern_avq_done, avq->name, false, VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto out_del_vqs; } return 0; out_del_vqs: vp_del_vqs(vdev); return err; } /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; /* Fallback: MSI-X with one shared vector for config and * slow path queues, one vector per queue for the rest. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED_SLOW, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED, desc); if (!err) return 0; /* Is there an interrupt? If not give up. */ if (!(to_vp_device(vdev)->pci_dev->irq)) return err; /* Finally fall back to regular interrupts. */ return vp_find_vqs_intx(vdev, nvqs, vqs, vqs_info); } const char *vp_bus_name(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); return pci_name(vp_dev->pci_dev); } /* Setup the affinity for a virtqueue: * - force the affinity for per vq vector * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; struct cpumask *mask; unsigned int irq; if (!vq->callback) return -EINVAL; if (vp_dev->msix_enabled) { mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); irq_set_affinity_and_hint(irq, mask); } } return 0; } const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(vp_dev->vqs[index]->msix_vector)) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, vp_dev->vqs[index]->msix_vector); } #ifdef CONFIG_PM_SLEEP static int virtio_pci_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = virtio_device_freeze(&vp_dev->vdev); if (!ret) pci_disable_device(pci_dev); return ret; } static int virtio_pci_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = pci_enable_device(pci_dev); if (ret) return ret; pci_set_master(pci_dev); return virtio_device_restore(&vp_dev->vdev); } static bool vp_supports_pm_no_reset(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); u16 pmcsr; if (!pci_dev->pm_cap) return false; pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { dev_err(dev, "Unable to query pmcsr"); return false; } return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET; } static int virtio_pci_suspend(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev); } static int virtio_pci_resume(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev); } static const struct dev_pm_ops virtio_pci_pm_ops = { .suspend = virtio_pci_suspend, .resume = virtio_pci_resume, .freeze = virtio_pci_freeze, .thaw = virtio_pci_restore, .poweroff = virtio_pci_freeze, .restore = virtio_pci_restore, }; #endif /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ static const struct pci_device_id virtio_pci_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, { 0 } }; MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); static void virtio_pci_release_dev(struct device *_d) { struct virtio_device *vdev = dev_to_virtio(_d); struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* As struct device is a kobject, it's not safe to * free the memory (including the reference counter itself) * until it's release callback. */ kfree(vp_dev); } static int virtio_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { struct virtio_pci_device *vp_dev, *reg_dev = NULL; int rc; /* allocate our structure and fill it out */ vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); if (!vp_dev) return -ENOMEM; pci_set_drvdata(pci_dev, vp_dev); vp_dev->vdev.dev.parent = &pci_dev->dev; vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); INIT_LIST_HEAD(&vp_dev->slow_virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ rc = pci_enable_device(pci_dev); if (rc) goto err_enable_device; if (force_legacy) { rc = virtio_pci_legacy_probe(vp_dev); /* Also try modern mode if we can't map BAR0 (no IO space). */ if (rc == -ENODEV || rc == -ENOMEM) rc = virtio_pci_modern_probe(vp_dev); if (rc) goto err_probe; } else { rc = virtio_pci_modern_probe(vp_dev); if (rc == -ENODEV) rc = virtio_pci_legacy_probe(vp_dev); if (rc) goto err_probe; } pci_set_master(pci_dev); rc = register_virtio_device(&vp_dev->vdev); reg_dev = vp_dev; if (rc) goto err_register; return 0; err_register: if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); err_probe: pci_disable_device(pci_dev); err_enable_device: if (reg_dev) put_device(&vp_dev->vdev.dev); else kfree(vp_dev); return rc; } static void virtio_pci_remove(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); /* * Device is marked broken on surprise removal so that virtio upper * layers can abort any ongoing operation. */ if (!pci_device_is_present(pci_dev)) virtio_break_device(&vp_dev->vdev); pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); put_device(dev); } static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct virtio_device *vdev = &vp_dev->vdev; int ret; if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) return -EBUSY; if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) return -EINVAL; if (pci_vfs_assigned(pci_dev)) return -EPERM; if (num_vfs == 0) { pci_disable_sriov(pci_dev); return 0; } ret = pci_enable_sriov(pci_dev, num_vfs); if (ret < 0) return ret; return num_vfs; } static void virtio_pci_reset_prepare(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret = 0; ret = virtio_device_reset_prepare(&vp_dev->vdev); if (ret) { if (ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset prepare failure: %d", ret); return; } if (pci_is_enabled(pci_dev)) pci_disable_device(pci_dev); } static void virtio_pci_reset_done(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; if (pci_is_enabled(pci_dev)) return; ret = pci_enable_device(pci_dev); if (!ret) { pci_set_master(pci_dev); ret = virtio_device_reset_done(&vp_dev->vdev); } if (ret && ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset done failure: %d", ret); } static const struct pci_error_handlers virtio_pci_err_handler = { .reset_prepare = virtio_pci_reset_prepare, .reset_done = virtio_pci_reset_done, }; static struct pci_driver virtio_pci_driver = { .name = "virtio-pci", .id_table = virtio_pci_id_table, .probe = virtio_pci_probe, .remove = virtio_pci_remove, #ifdef CONFIG_PM_SLEEP .driver.pm = &virtio_pci_pm_ops, #endif .sriov_configure = virtio_pci_sriov_configure, .err_handler = &virtio_pci_err_handler, }; struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) { struct virtio_pci_device *pf_vp_dev; pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver); if (IS_ERR(pf_vp_dev)) return NULL; return &pf_vp_dev->vdev; } module_pci_driver(virtio_pci_driver); MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); MODULE_DESCRIPTION("virtio-pci"); MODULE_LICENSE("GPL"); MODULE_VERSION("1");
21780 21789 13023 13014 1657 160 1657 194 1654 1657 22061 22012 1657 1656 194 1656 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 // SPDX-License-Identifier: GPL-2.0 /* * trace context switch * * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> * */ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/kmemleak.h> #include <linux/ftrace.h> #include <trace/events/sched.h> #include "trace.h" #define RECORD_CMDLINE 1 #define RECORD_TGID 2 static int sched_cmdline_ref; static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(current, wakee, flags); } static int tracing_sched_register(void) { int ret; ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { unregister_trace_sched_switch(probe_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(int ops) { bool sched_register; mutex_lock(&sched_register_mutex); sched_register = (!sched_cmdline_ref && !sched_tgid_ref); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref++; break; case RECORD_TGID: sched_tgid_ref++; break; } if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref--; break; case RECORD_TGID: sched_tgid_ref--; break; } if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(RECORD_CMDLINE); } void tracing_start_tgid_record(void) { tracing_start_sched_switch(RECORD_TGID); } void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); } /* * The tgid_map array maps from pid to tgid; i.e. the value stored at index i * is the tgid last observed corresponding to pid=i. */ static int *tgid_map; /* The maximum valid index into tgid_map. */ static size_t tgid_map_max; #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX /* * Preemption must be disabled before acquiring trace_cmdline_lock. * The various trace_arrays' max_lock must be acquired in a context * where interrupt is disabled. */ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; unsigned cmdline_num; int cmdline_idx; char saved_cmdlines[]; }; static struct saved_cmdlines_buffer *savedcmd; /* Holds the size of a cmdline and pid element */ #define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \ (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0])) static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; } static inline void set_cmdline(int idx, const char *cmdline) { strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); kmemleak_free(s); free_pages((unsigned long)s, order); } static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) { struct saved_cmdlines_buffer *s; struct page *page; int orig_size, size; int order; /* Figure out how much is needed to hold the given number of cmdlines */ orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); order = get_order(orig_size); size = 1 << (order + PAGE_SHIFT); page = alloc_pages(GFP_KERNEL, order); if (!page) return NULL; s = page_address(page); kmemleak_alloc(s, size, 1, GFP_KERNEL); memset(s, 0, sizeof(*s)); /* Round up to actual allocation */ val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); s->cmdline_num = val; /* Place map_cmdline_to_pid array right after saved_cmdlines */ s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; s->cmdline_idx = 0; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); return s; } int trace_create_savedcmd(void) { savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); return savedcmd ? 0 : -ENOMEM; } int trace_save_cmdline(struct task_struct *tsk) { unsigned tpid, idx; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* * It's not the end of the world if we don't get * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. * * This is called within the scheduler and wake up, so interrupts * had better been disabled and run queue lock been held. */ lockdep_assert_preemption_disabled(); if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; savedcmd->map_pid_to_cmdline[tpid] = idx; savedcmd->cmdline_idx = idx; } savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); return 1; } static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; int tpid; if (!pid) { strcpy(comm, "<idle>"); return; } if (WARN_ON_ONCE(pid < 0)) { strcpy(comm, "<XXX>"); return; } tpid = pid & (PID_MAX_DEFAULT - 1); map = savedcmd->map_pid_to_cmdline[tpid]; if (map != NO_CMDLINE_MAP) { tpid = savedcmd->map_cmdline_to_pid[map]; if (tpid == pid) { strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); return; } } strcpy(comm, "<...>"); } void trace_find_cmdline(int pid, char comm[]) { preempt_disable(); arch_spin_lock(&trace_cmdline_lock); __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } static int *trace_find_tgid_ptr(int pid) { /* * Pairs with the smp_store_release in set_tracer_flag() to ensure that * if we observe a non-NULL tgid_map then we also observe the correct * tgid_map_max. */ int *map = smp_load_acquire(&tgid_map); if (unlikely(!map || pid > tgid_map_max)) return NULL; return &map[pid]; } int trace_find_tgid(int pid) { int *ptr = trace_find_tgid_ptr(pid); return ptr ? *ptr : 0; } static int trace_save_tgid(struct task_struct *tsk) { int *ptr; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; ptr = trace_find_tgid_ptr(tsk->pid); if (!ptr) return 0; *ptr = tsk->tgid; return 1; } static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false; } /** * tracing_record_taskinfo - record the task info of a task * * @task: task to record * @flags: TRACE_RECORD_CMDLINE for recording comm * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo(struct task_struct *task, int flags) { bool done; if (tracing_record_taskinfo_skip(flags)) return; /* * Record as much task information as possible. If some fail, continue * to try to record the others. */ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); /* If recording any information failed, retry again soon. */ if (!done) return; __this_cpu_write(trace_taskinfo_save, false); } /** * tracing_record_taskinfo_sched_switch - record task info for sched_switch * * @prev: previous task during sched_switch * @next: next task during sched_switch * @flags: TRACE_RECORD_CMDLINE for recording comm * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { bool done; if (tracing_record_taskinfo_skip(flags)) return; /* * Record as much task information as possible. If some fail, continue * to try to record the others. */ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); /* If recording any information failed, retry again soon. */ if (!done) return; __this_cpu_write(trace_taskinfo_save, false); } /* Helpers to record a specific task information */ void tracing_record_cmdline(struct task_struct *task) { tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); } void tracing_record_tgid(struct task_struct *task) { tracing_record_taskinfo(task, TRACE_RECORD_TGID); } int trace_alloc_tgid_map(void) { int *map; if (tgid_map) return 0; tgid_map_max = init_pid_ns.pid_max; map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), GFP_KERNEL); if (!map) return -ENOMEM; /* * Pairs with smp_load_acquire() in * trace_find_tgid_ptr() to ensure that if it observes * the tgid_map we just allocated then it also observes * the corresponding tgid_map_max value. */ smp_store_release(&tgid_map, map); return 0; } static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) { int pid = ++(*pos); return trace_find_tgid_ptr(pid); } static void *saved_tgids_start(struct seq_file *m, loff_t *pos) { int pid = *pos; return trace_find_tgid_ptr(pid); } static void saved_tgids_stop(struct seq_file *m, void *v) { } static int saved_tgids_show(struct seq_file *m, void *v) { int *entry = (int *)v; int pid = entry - tgid_map; int tgid = *entry; if (tgid == 0) return SEQ_SKIP; seq_printf(m, "%d %d\n", pid, tgid); return 0; } static const struct seq_operations tracing_saved_tgids_seq_ops = { .start = saved_tgids_start, .stop = saved_tgids_stop, .next = saved_tgids_next, .show = saved_tgids_show, }; static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_saved_tgids_seq_ops); } const struct file_operations tracing_saved_tgids_fops = { .open = tracing_saved_tgids_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; if (*pos || m->count) ptr++; (*pos)++; for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; ptr++) { if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) continue; return ptr; } return NULL; } static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) { void *v; loff_t l = 0; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); v = &savedcmd->map_cmdline_to_pid[0]; while (l <= *pos) { v = saved_cmdlines_next(m, v, &l); if (!v) return NULL; } return v; } static void saved_cmdlines_stop(struct seq_file *m, void *v) { arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } static int saved_cmdlines_show(struct seq_file *m, void *v) { char buf[TASK_COMM_LEN]; unsigned int *pid = v; __trace_find_cmdline(*pid, buf); seq_printf(m, "%d %s\n", *pid, buf); return 0; } static const struct seq_operations tracing_saved_cmdlines_seq_ops = { .start = saved_cmdlines_start, .next = saved_cmdlines_next, .stop = saved_cmdlines_stop, .show = saved_cmdlines_show, }; static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_saved_cmdlines_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static ssize_t tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; int r; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } void trace_free_saved_cmdlines_buffer(void) { free_saved_cmdlines_buffer(savedcmd); } static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; s = allocate_cmdlines_buffer(val); if (!s) return -ENOMEM; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; } static ssize_t tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; /* must have at least 1 entry or less than PID_MAX_DEFAULT */ if (!val || val > PID_MAX_DEFAULT) return -EINVAL; ret = tracing_resize_saved_cmdlines((unsigned int)val); if (ret < 0) return ret; *ppos += cnt; return cnt; } const struct file_operations tracing_saved_cmdlines_size_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_size_read, .write = tracing_saved_cmdlines_size_write, };
31 193 38 38 37 419 37 8 36 38 14 16 2 30 30 30 49 72 72 31 26 3 1 7 17 72 72 1 1 72 71 72 22 49 18 18 116 154 154 27 124 2 125 15 116 21 90 13 116 104 14 94 7 7 11 102 8 8 1 1 93 112 12 100 9 91 99 110 2 30 82 10 23 77 176 31 31 31 31 31 31 31 5 22 4 31 30 31 31 30 30 31 31 13 18 18 1 1 1 19 20 26 35 1 34 17 1 16 6 21 1 1 1 21 2 1 20 19 20 4 36 36 1 22 8 1 32 1 33 21 3 43 43 42 43 43 36 30 28 1 28 1 1 8 2 6 72 73 70 2 13 1 47 1 5 5 62 1 1 1 1 43 10 8 2 1 3 4 3 3 8 14 3 57 57 57 56 40 16 1 1 1 57 93 1 9 37 24 24 36 36 96 96 96 8 1 1 7 6 7 65 22 1 32 36 46 3 22 1 22 2 19 263 263 263 263 1 49 1 143 1 33 24 45 158 38 1 85 163 8 1 106 1 19 123 5 2 74 51 2 36 136 42 118 105 7 52 40 13 6 118 4 121 87 37 122 117 5 121 53 2 106 81 18 78 16 8 8 8 19 17 1 26 140 140 115 68 19 8 27 95 95 45 49 3 43 53 376 62 316 44 4 40 93 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 // SPDX-License-Identifier: GPL-2.0-or-later /* * UDP over IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/ipv4/udp.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data * YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file. */ #include <linux/bpf-cgroup.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/indirect_call_wrapper.h> #include <trace/events/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/raw.h> #include <net/seg6.h> #include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/ip6_tunnel.h> #include <net/udp_tunnel.h> #include <net/xfrm.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/busy_poll.h> #include <net/sock_reuseport.h> #include <net/gro.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <trace/events/skb.h> #include "udp_impl.h" static void udpv6_destruct_sock(struct sock *sk) { udp_destruct_common(sk); inet6_sock_destruct(sk); } int udpv6_init_sock(struct sock *sk) { udp_lib_init_sock(sk); sk->sk_destruct = udpv6_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } INDIRECT_CALLABLE_SCOPE u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { u32 lhash, fhash; net_get_random_once(&udp6_ehash_secret, sizeof(udp6_ehash_secret)); net_get_random_once(&udp_ipv6_hash_secret, sizeof(udp_ipv6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, udp6_ehash_secret + net_hash_mix(net)); } int udp_v6_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum); unsigned int hash2_partial = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; return udp_lib_get_port(sk, snum, hash2_nulladdr); } void udp_v6_rehash(struct sock *sk) { u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); u16 new_hash4; if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { new_hash4 = udp_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); } else { new_hash4 = udp6_ehashfn(sock_net(sk), &sk->sk_v6_rcv_saddr, sk->sk_num, &sk->sk_v6_daddr, sk->sk_dport); } udp_lib_rehash(sk, new_hash, new_hash4); } static int compute_score(struct sock *sk, const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, int dif, int sdif) { int bound_dev_if, score; struct inet_sock *inet; bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || sk->sk_family != PF_INET6) return -1; if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; score = 0; inet = inet_sk(sk); if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; score++; } if (!ipv6_addr_any(&sk->sk_v6_daddr)) { if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) return -1; score++; } bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); dev_match = udp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif); if (!dev_match) return -1; if (bound_dev_if) score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } /** * udp6_lib_lookup1() - Simplified lookup using primary hash (destination port) * @net: Network namespace * @saddr: Source address, network order * @sport: Source port, network order * @daddr: Destination address, network order * @hnum: Destination port, host order * @dif: Destination interface index * @sdif: Destination bridge port index, if relevant * @udptable: Set of UDP hash tables * * Simplified lookup to be used as fallback if no sockets are found due to a * potential race between (receive) address change, and lookup happening before * the rehash operation. This function ignores SO_REUSEPORT groups while scoring * result sockets, because if we have one, we don't need the fallback at all. * * Called under rcu_read_lock(). * * Return: socket with highest matching score if any, NULL if none */ static struct sock *udp6_lib_lookup1(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, int sdif, const struct udp_table *udptable) { unsigned int slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot = &udptable->hash[slot]; struct sock *sk, *result = NULL; int score, badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { result = sk; badness = score; } } return result; } /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; int score, badness; bool need_rescore; result = NULL; badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { need_rescore = false; rescore: score = compute_score(need_rescore ? result : sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; if (need_rescore) continue; if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; } result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, udp6_ehashfn); if (!result) { result = sk; continue; } /* Fall back to scoring if group has connections */ if (!reuseport_has_conns(sk)) return result; /* Reuseport logic returned an error, keep original score. */ if (IS_ERR(result)) continue; /* compute_score is too long of a function to be * inlined, and calling it again here yields * measureable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. */ need_rescore = true; goto rescore; } } return result; } #if IS_ENABLED(CONFIG_BASE_SMALL) static struct sock *udp6_lib_lookup4(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, int sdif, struct udp_table *udptable) { return NULL; } static void udp6_hash4(struct sock *sk) { } #else /* !CONFIG_BASE_SMALL */ static struct sock *udp6_lib_lookup4(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, int sdif, struct udp_table *udptable) { const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const struct hlist_nulls_node *node; struct udp_hslot *hslot4; unsigned int hash4, slot; struct udp_sock *up; struct sock *sk; hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport); slot = hash4 & udptable->mask; hslot4 = &udptable->hash4[slot]; begin: udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { sk = (struct sock *)up; if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) return sk; } /* if the nulls value we got at the end of this lookup is not the * expected one, we must restart lookup. We probably met an item that * was moved to another chain due to rehash. */ if (get_nulls_value(node) != slot) goto begin; return NULL; } static void udp6_hash4(struct sock *sk) { struct net *net = sock_net(sk); unsigned int hash; if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { udp4_hash4(sk); return; } if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr)) return; hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num, &sk->sk_v6_daddr, sk->sk_dport); udp_lib_hash4(sk, hash); } #endif /* CONFIG_BASE_SMALL */ /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); struct udp_hslot *hslot2; struct sock *result, *sk; unsigned int hash2; hash2 = ipv6_portaddr_hash(net, daddr, hnum); hslot2 = udp_hashslot2(udptable, hash2); if (udp_has_hash4(hslot2)) { result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum, dif, sdif, udptable); if (result) /* udp6_lib_lookup4 return sk or NULL */ return result; } /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) goto done; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && udptable == net->ipv4.udp_table) { sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, dif, udp6_ehashfn); if (sk) { result = sk; goto done; } } /* Got non-wildcard socket or error on first lookup */ if (result) goto done; /* Lookup wildcard sockets */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); hslot2 = udp_hashslot2(udptable, hash2); result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result)) goto done; /* Cover address change/lookup/rehash race: see __udp4_lib_lookup() */ result = udp6_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, udptable); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__udp6_lib_lookup); static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { const struct ipv6hdr *iph = ipv6_hdr(skb); return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), inet6_sdif(skb), udptable, skb); } struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct net *net = dev_net(skb->dev); int iif, sdif; inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, &iph->daddr, dport, iif, sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6) struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { struct sock *sk; sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, 0, net->ipv4.udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif /* do not use the scratch area len for jumbogram: their length execeeds the * scratch area space; note that the IP6CB flags is still in the first * cacheline, so checking for jumbograms is cheap */ static int udp6_skb_len(struct sk_buff *skb) { return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb); } /* * This should be easy, if there is something there we * return it, otherwise we block. */ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; unsigned int ulen, copied; int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); struct udp_mib __percpu *mib; bool checksum_valid = false; int is_udp4; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); if (np->rxpmtu && np->rxopt.bits.rxpmtu) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, &off, &err); if (!skb) return err; ulen = udp6_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; is_udp4 = (skb->protocol == htons(ETH_P_IP)); mib = __UDPX_MIB(sk, is_udp4); /* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial * coverage checksum (UDP-Lite), do it before the copy. */ if (copied < ulen || peeking || (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { checksum_valid = udp_skb_csum_unnecessary(skb) || !__udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } if (checksum_valid || udp_skb_csum_unnecessary(skb)) { if (udp_skb_is_linear(skb)) err = copy_linear_skb(skb, copied, off, &msg->msg_iter); else err = skb_copy_datagram_msg(skb, off, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) goto csum_copy_err; } if (unlikely(err)) { if (!peeking) { atomic_inc(&sk->sk_drops); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); return err; } if (!peeking) SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS); sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); sin6->sin6_family = AF_INET6; sin6->sin6_port = udp_hdr(skb)->source; sin6->sin6_flowinfo = 0; if (is_udp4) { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin6->sin6_addr); sin6->sin6_scope_id = 0; } else { sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); } *addr_len = sizeof(*sin6); BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, (struct sockaddr *)sin6, addr_len); } if (udp_test_bit(GRO_ENABLED, sk)) udp_cmsg_recv(msg, sk, skb); if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); if (is_udp4) { if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); } else { if (np->rxopt.all) ip6_datagram_recv_specific_ctl(sk, msg, skb); } err = copied; if (flags & MSG_TRUNC) err = ulen; skb_consume_udp(sk, skb, peeking ? -err : err); return err; csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_inc(&udpv6_encap_needed_key); } EXPORT_SYMBOL(udpv6_encap_enable); /* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match. */ static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { int i; for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); const struct ip6_tnl_encap_ops *encap; encap = rcu_dereference(ip6tun_encaps[i]); if (!encap) continue; handler = encap->err_handler; if (handler && !handler(skb, opt, type, code, offset, info)) return 0; } return -ENOENT; } /* Try to match ICMP errors to UDP tunnels by looking up a socket without * reversing source and destination port: this will match tunnels that force the * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that * lwtunnels might actually break this assumption by being configured with * different destination ports on endpoints, in this case we won't be able to * trace ICMP messages back to them. * * If this doesn't match any socket, probe tunnels with arbitrary destination * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port * we've sent packets to won't necessarily match the local destination port. * * Then ask the tunnel implementation to match the error against a valid * association. * * Return an error if we can't find a match, the socket if we need further * processing, zero otherwise. */ static struct sock *__udp6_lib_err_encap(struct net *net, const struct ipv6hdr *hdr, int offset, struct udphdr *uh, struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, __be32 info) { int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; struct udp_sock *up; network_offset = skb_network_offset(skb); transport_offset = skb_transport_offset(skb); /* Network header needs to point to the outer IPv6 header inside ICMP */ skb_reset_network_header(skb); /* Transport header needs to point to the UDP header */ skb_set_transport_header(skb, offset); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (lookup && lookup(sk, skb)) sk = NULL; goto out; } sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source, &hdr->saddr, uh->dest, inet6_iif(skb), 0, udptable, skb); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (!lookup || lookup(sk, skb)) sk = NULL; } out: if (!sk) { sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code, offset, info)); } skb_set_transport_header(skb, transport_offset); skb_set_network_header(skb, network_offset); return sk; } int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info, struct udp_table *udptable) { struct ipv6_pinfo *np; const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct in6_addr *saddr = &hdr->saddr; const struct in6_addr *daddr = seg6_get_daddr(skb, opt) ? : &hdr->daddr; struct udphdr *uh = (struct udphdr *)(skb->data+offset); bool tunnel = false; struct sock *sk; int harderr; int err; struct net *net = dev_net(skb->dev); sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, NULL); if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udpv6_encap_needed_key)) { sk = __udp6_lib_err_encap(net, hdr, offset, uh, udptable, sk, skb, opt, type, code, info); if (!sk) return 0; } else sk = ERR_PTR(-ENOENT); if (IS_ERR(sk)) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return PTR_ERR(sk); } tunnel = true; } harderr = icmpv6_err_convert(type, code, &err); np = inet6_sk(sk); if (type == ICMPV6_PKT_TOOBIG) { if (!ip6_sk_accept_pmtu(sk)) goto out; ip6_sk_update_pmtu(skb, sk, info); if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT) harderr = 1; } if (type == NDISC_REDIRECT) { if (tunnel) { ip6_redirect(skb, sock_net(sk), inet6_iif(skb), READ_ONCE(sk->sk_mark), sk->sk_uid); } else { ip6_sk_redirect(skb, sk); } goto out; } /* Tunnels don't have an application socket: don't pass errors back */ if (tunnel) { if (udp_sk(sk)->encap_err_rcv) udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); goto out; } if (!inet6_test_bit(RECVERR6, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); } sk->sk_err = err; sk_error_report(sk); out: return 0; } static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; if (!ipv6_addr_any(&sk->sk_v6_daddr)) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } else { sk_mark_napi_id_once(sk, skb); } rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); enum skb_drop_reason drop_reason; /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) { UDP6_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; } else { UDP6_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, is_udplite); drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); trace_udp_fail_queue_rcv_skb(rc, sk, skb); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } return 0; } static __inline__ int udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { return __udp6_lib_err(skb, opt, type, code, offset, info, dev_net(skb->dev)->ipv4.udp_table); } static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; } nf_reset_ct(skb); if (static_branch_unlikely(&udpv6_encap_needed_key) && READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just * fall through and pass this up the UDP socket. * up->encap_rcv() returns the following value: * =0 if skb was successfully passed to the encap * handler or was discarded by it. * >0 if skb should be passed on to UDP. * <0 if skb should be resubmitted as proto -N */ /* if we're overly short, let UDP handle it */ encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; /* Verify checksum before giving to encap */ if (udp_lib_checksum_complete(skb)) goto csum_error; ret = encap_rcv(sk, skb); if (ret <= 0) { __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); return -ret; } } /* FALLTHROUGH -- it's a UDP Packet */ } /* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). */ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { u16 pcrlen = READ_ONCE(up->pcrlen); if (pcrlen == 0) { /* full coverage was set */ net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n", UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } if (UDP_SKB_CB(skb)->cscov < pcrlen) { net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n", UDP_SKB_CB(skb)->cscov, pcrlen); goto drop; } } prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) goto csum_error; if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto drop; } udp_csum_pull_header(skb); skb_dst_drop(skb); return __udpv6_queue_rcv_skb(sk, skb); csum_error: drop_reason = SKB_DROP_REASON_UDP_CSUM; __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff *next, *segs; int ret; if (likely(!udp_unexpected_gso(sk, skb))) return udpv6_queue_rcv_one_skb(sk, skb); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, false); skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); udp_post_segment_fix_csum(skb); ret = udpv6_queue_rcv_one_skb(sk, skb); if (ret > 0) ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret, true); } return 0; } static bool __udp_v6_is_mcast_sock(struct net *net, const struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, int dif, int sdif, unsigned short hnum) { const struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) return false; if (udp_sk(sk)->udp_port_hash != hnum || sk->sk_family != PF_INET6 || (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || !udp_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif) || (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; if (!inet6_mc_check(sk, loc_addr, rmt_addr)) return false; return true; } static void udp6_csum_zero_error(struct sk_buff *skb) { /* RFC 2460 section 8.1 says that we SHOULD log * this error. Well, it is reasonable. */ net_dbg_ratelimited("IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n", &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source), &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest)); } /* * Note: called only from the BH handler context, * so we don't need to lock the hashes. */ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, struct udp_table *udptable, int proto) { struct sock *sk, *first = NULL; const struct udphdr *uh = udp_hdr(skb); unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; if (use_hash2) { hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) & udptable->mask; hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, uh->source, saddr, dif, sdif, hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it. */ if (!uh->check && !udp_get_no_check6_rx(sk)) continue; if (!first) { first = sk; continue; } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { atomic_inc(&sk->sk_drops); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, IS_UDPLITE(sk)); continue; } if (udpv6_queue_rcv_skb(sk, nskb) > 0) consume_skb(nskb); } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { hash2 = hash2_any; goto start_lookup; } if (first) { if (udpv6_queue_rcv_skb(first, skb) > 0) consume_skb(skb); } else { kfree_skb(skb); __UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI, proto == IPPROTO_UDPLITE); } return 0; } static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { if (udp_sk_rx_dst_set(sk, dst)) sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } /* wrapper for udp_queue_rcv_skb tacking care of csum conversion and * return code conversion for ip layer consumption */ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, struct udphdr *uh) { int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); /* a return value > 0 means to resubmit the input */ if (ret > 0) return ret; return 0; } int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); struct sock *sk = NULL; struct udphdr *uh; bool refcounted; u32 ulen = 0; if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto discard; saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; uh = udp_hdr(skb); ulen = ntohs(uh->len); if (ulen > skb->len) goto short_packet; if (proto == IPPROTO_UDP) { /* UDP validates ulen. */ /* Check for jumbo payload */ if (ulen == 0) ulen = skb->len; if (ulen < sizeof(*uh)) goto short_packet; if (ulen < skb->len) { if (pskb_trim_rcsum(skb, ulen)) goto short_packet; saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; uh = udp_hdr(skb); } } if (udp6_csum_init(skb, uh, proto)) goto csum_error; /* Check if the socket is already available, e.g. due to early demux */ sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, &refcounted, udp6_ehashfn); if (IS_ERR(sk)) goto no_sk; if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp6_sk_rx_dst_set(sk, dst); if (!uh->check && !udp_get_no_check6_rx(sk)) { if (refcounted) sock_put(sk); goto report_csum_error; } ret = udp6_unicast_rcv_skb(sk, skb, uh); if (refcounted) sock_put(sk); return ret; } /* * Multicast receive code */ if (ipv6_addr_is_multicast(daddr)) return __udp6_lib_mcast_deliver(net, skb, saddr, daddr, udptable, proto); /* Unicast */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { if (!uh->check && !udp_get_no_check6_rx(sk)) goto report_csum_error; return udp6_unicast_rcv_skb(sk, skb, uh); } no_sk: reason = SKB_DROP_REASON_NO_SOCKET; if (!uh->check) goto report_csum_error; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; nf_reset_ct(skb); if (udp_lib_checksum_complete(skb)) goto csum_error; __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); sk_skb_reason_drop(sk, skb, reason); return 0; short_packet: if (reason == SKB_DROP_REASON_NOT_SPECIFIED) reason = SKB_DROP_REASON_PKT_TOO_SMALL; net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n", proto == IPPROTO_UDPLITE ? "-Lite" : "", saddr, ntohs(uh->source), ulen, skb->len, daddr, ntohs(uh->dest)); goto discard; report_csum_error: udp6_csum_zero_error(skb); csum_error: if (reason == SKB_DROP_REASON_NOT_SPECIFIED) reason = SKB_DROP_REASON_UDP_CSUM; __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); sk_skb_reason_drop(sk, skb, reason); return 0; } static struct sock *__udp6_lib_demux_lookup(struct net *net, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); struct udp_hslot *hslot2; unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (sk->sk_state == TCP_ESTABLISHED && inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; } return NULL; } void udp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct udphdr *uh; struct sock *sk; struct dst_entry *dst; int dif = skb->dev->ifindex; int sdif = inet6_sdif(skb); if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return; uh = udp_hdr(skb); if (skb->pkt_type == PACKET_HOST) sk = __udp6_lib_demux_lookup(net, uh->dest, &ipv6_hdr(skb)->daddr, uh->source, &ipv6_hdr(skb)->saddr, dif, sdif); else return; if (!sk) return; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); skb->destructor = sock_pfree; dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst) { /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); } } INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb) { return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); } /* * Throw away all pending data and cancel the corking. Socket is locked. */ static void udp_v6_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET) udp_flush_pending_frames(sk); else if (up->pending) { up->len = 0; WRITE_ONCE(up->pending, 0); ip6_flush_pending_frames(sk); } } static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; /* The following checks are replicated from __ip6_datagram_connect() * and intended to prevent BPF program called below from accessing * bytes that are out of the bound specified by user in addr_len. */ if (uaddr->sa_family == AF_INET) { if (ipv6_only_sock(sk)) return -EAFNOSUPPORT; return udp_pre_connect(sk, uaddr, addr_len); } if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip6_datagram_connect(sk, uaddr, addr_len); if (!res) udp6_hash4(sk); release_sock(sk); return res; } /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) * @saddr: source address * @daddr: destination address * @len: length of packet */ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len) { unsigned int offset; struct udphdr *uh = udp_hdr(skb); struct sk_buff *frags = skb_shinfo(skb)->frag_list; __wsum csum = 0; if (!frags) { /* Only one fragment on the socket. */ skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0); } else { /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; do { csum = csum_add(csum, frags->csum); } while ((frags = frags->next)); uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } } /* * Sending */ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, struct inet_cork *cork) { struct sock *sk = skb->sk; struct udphdr *uh; int err = 0; int is_udplite = IS_UDPLITE(sk); __wsum csum = 0; int offset = skb_transport_offset(skb); int len = skb->len - offset; int datalen = len - sizeof(*uh); /* * Create a UDP header */ uh = udp_hdr(skb); uh->source = fl6->fl6_sport; uh->dest = fl6->fl6_dport; uh->len = htons(len); uh->check = 0; if (cork->gso_size) { const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; } if (udp_get_no_check6_tx(sk)) { kfree_skb(skb); return -EINVAL; } if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } if (datalen > cork->gso_size) { skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); /* Don't checksum the payload, skb will get segmented */ goto csum_partial; } } if (is_udplite) csum = udplite_csum(skb); else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ csum_partial: udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len); goto send; } else csum = udp_csum(skb); /* add protocol-dependent pseudo-header */ uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, len, fl6->flowi6_proto, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: err = ip6_send_skb(skb); if (err) { if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) { UDP6_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; } } else { UDP6_INC_STATS(sock_net(sk), UDP_MIB_OUTDATAGRAMS, is_udplite); } return err; } static int udp_v6_push_pending_frames(struct sock *sk) { struct sk_buff *skb; struct udp_sock *up = udp_sk(sk); int err = 0; if (up->pending == AF_INET) return udp_push_pending_frames(sk); skb = ip6_finish_skb(sk); if (!skb) goto out; err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6, &inet_sk(sk)->cork.base); out: up->len = 0; WRITE_ONCE(up->pending, 0); return err; } int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; struct ipv6_txoptions *opt_to_free = NULL; struct ip6_flowlabel *flowlabel = NULL; struct inet_cork_full cork; struct flowi6 *fl6 = &cork.fl.u.ip6; struct dst_entry *dst; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; bool connected = false; int ulen = len; int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); /* destination address check */ if (sin6) { if (addr_len < offsetof(struct sockaddr, sa_data)) return -EINVAL; switch (sin6->sin6_family) { case AF_INET6: if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; daddr = &sin6->sin6_addr; if (ipv6_addr_any(daddr) && ipv6_addr_v4mapped(&np->saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), daddr); break; case AF_INET: goto do_udp_sendmsg; case AF_UNSPEC: msg->msg_name = sin6 = NULL; msg->msg_namelen = addr_len = 0; daddr = NULL; break; default: return -EINVAL; } } else if (!READ_ONCE(up->pending)) { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; } else daddr = NULL; if (daddr) { if (ipv6_addr_v4mapped(daddr)) { struct sockaddr_in sin; sin.sin_family = AF_INET; sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport; sin.sin_addr.s_addr = daddr->s6_addr32[3]; msg->msg_name = &sin; msg->msg_namelen = sizeof(sin); do_udp_sendmsg: err = ipv6_only_sock(sk) ? -ENETUNREACH : udp_sendmsg(sk, msg, len); msg->msg_name = sin6; msg->msg_namelen = addr_len; return err; } } /* Rough check on arithmetic overflow, better check is made in ip6_append_data(). */ if (len > INT_MAX - sizeof(struct udphdr)) return -EMSGSIZE; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; if (READ_ONCE(up->pending)) { if (READ_ONCE(up->pending) == AF_INET) return udp_sendmsg(sk, msg, len); /* * There are pending frames. * The socket lock must be held while it's corked. */ lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET6)) { release_sock(sk); return -EAFNOSUPPORT; } dst = NULL; goto do_append_data; } release_sock(sk); } ulen += sizeof(struct udphdr); memset(fl6, 0, sizeof(*fl6)); if (sin6) { if (sin6->sin6_port == 0) return -EINVAL; fl6->fl6_dport = sin6->sin6_port; daddr = &sin6->sin6_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* * Otherwise it will be difficult to maintain * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) fl6->flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; fl6->fl6_dport = inet->inet_dport; daddr = &sk->sk_v6_daddr; fl6->flowlabel = np->flow_label; connected = true; } if (!fl6->flowi6_oif) fl6->flowi6_oif = READ_ONCE(sk->sk_bound_dev_if); if (!fl6->flowi6_oif) fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; fl6->flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); ipc6.opt = opt; err = udp_cmsg_send(sk, msg, &ipc6.gso_size); if (err > 0) { err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6, &ipc6); connected = false; } if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6->flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); ipc6.opt = opt; fl6->flowi6_proto = sk->sk_protocol; fl6->flowi6_mark = ipc6.sockc.mark; fl6->daddr = *daddr; if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr)) fl6->saddr = np->saddr; fl6->fl6_sport = inet->inet_sport; if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &addr_len, &fl6->saddr); if (err) goto out_no_dst; if (sin6) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { /* BPF program rewrote IPv6-only by IPv4-mapped * IPv6. It's currently unsupported. */ err = -ENOTSUPP; goto out_no_dst; } if (sin6->sin6_port == 0) { /* BPF program set invalid port. Reject it. */ err = -EINVAL; goto out_no_dst; } fl6->fl6_dport = sin6->sin6_port; fl6->daddr = sin6->sin6_addr; } } if (ipv6_addr_any(&fl6->daddr)) fl6->daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ final_p = fl6_update_dst(fl6, opt, &final); if (final_p) connected = false; if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) { fl6->flowi6_oif = READ_ONCE(np->mcast_oif); connected = false; } else if (!fl6->flowi6_oif) fl6->flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel); dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, fl6, dst); if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: /* Lockless fast path for the non-corking case */ if (!corkreq) { struct sk_buff *skb; skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, dst_rt6_info(dst), msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, fl6, &cork.base); /* ip6_make_skb steals dst reference */ goto out_no_dst; } lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */ release_sock(sk); net_dbg_ratelimited("udp cork app bug 2\n"); err = -EINVAL; goto out; } WRITE_ONCE(up->pending, AF_INET6); do_append_data: up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, fl6, dst_rt6_info(dst), corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) err = udp_v6_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) WRITE_ONCE(up->pending, 0); if (err > 0) err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0; release_sock(sk); out: dst_release(dst); out_no_dst: fl6_sock_release(flowlabel); txopt_put(opt_to_free); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { UDP6_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); } return err; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } EXPORT_SYMBOL(udpv6_sendmsg); static void udpv6_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return; lock_sock(sk); if (up->pending && !udp_test_bit(CORK, sk)) udp_v6_push_pending_frames(sk); release_sock(sk); } void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); lock_sock(sk); /* protects from races with udp_abort() */ sock_set_flag(sk, SOCK_DEAD); udp_v6_flush_pending_frames(sk); release_sock(sk); if (static_branch_unlikely(&udpv6_encap_needed_key)) { if (up->encap_type) { void (*encap_destroy)(struct sock *sk); encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); udp_tunnel_cleanup_gro(sk); } } } /* * Socket option code for UDP */ int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_v6_push_pending_frames); return ipv6_setsockopt(sk, level, optname, optval, optlen); } int udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_getsockopt(sk, level, optname, optval, optlen); return ipv6_getsockopt(sk, level, optname, optval, optlen); } /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS int udp6_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { int bucket = ((struct udp_iter_state *)seq->private)->bucket; const struct inet_sock *inet = inet_sk((const struct sock *)v); __u16 srcp = ntohs(inet->inet_sport); __u16 destp = ntohs(inet->inet_dport); __ip6_dgram_sock_seq_show(seq, v, srcp, destp, udp_rqueue_get(v), bucket); } return 0; } const struct seq_operations udp6_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, .stop = udp_seq_stop, .show = udp6_seq_show, }; EXPORT_SYMBOL(udp6_seq_ops); static struct udp_seq_afinfo udp6_seq_afinfo = { .family = AF_INET6, .udp_table = NULL, }; int __net_init udp6_proc_init(struct net *net) { if (!proc_create_net_data("udp6", 0444, net->proc_net, &udp6_seq_ops, sizeof(struct udp_iter_state), &udp6_seq_afinfo)) return -ENOMEM; return 0; } void udp6_proc_exit(struct net *net) { remove_proc_entry("udp6", net->proc_net); } #endif /* CONFIG_PROC_FS */ /* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { .name = "UDPv6", .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udpv6_pre_connect, .connect = udpv6_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udpv6_init_sock, .destroy = udpv6_destroy_sock, .setsockopt = udpv6_setsockopt, .getsockopt = udpv6_getsockopt, .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, .splice_eof = udpv6_splice_eof, .release_cb = ip6_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, .put_port = udp_lib_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif .memory_allocated = &udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = NULL, .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udpv6_prot, .ops = &inet6_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }; int __init udpv6_init(void) { int ret; net_hotdata.udpv6_protocol = (struct inet6_protocol) { .handler = udpv6_rcv, .err_handler = udpv6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; ret = inet6_add_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); if (ret) goto out; ret = inet6_register_protosw(&udpv6_protosw); if (ret) goto out_udpv6_protocol; out: return ret; out_udpv6_protocol: inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); goto out; } void udpv6_exit(void) { inet6_unregister_protosw(&udpv6_protosw); inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); }
827 66 53 13 6 5 59 19 57 1 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 // SPDX-License-Identifier: GPL-2.0 #include <net/ip.h> #include <net/udp.h> #include <net/udplite.h> #include <asm/checksum.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum) { int carry; __u32 ulen; __u32 uproto; __u32 sum = (__force u32)csum; sum += (__force u32)saddr->s6_addr32[0]; carry = (sum < (__force u32)saddr->s6_addr32[0]); sum += carry; sum += (__force u32)saddr->s6_addr32[1]; carry = (sum < (__force u32)saddr->s6_addr32[1]); sum += carry; sum += (__force u32)saddr->s6_addr32[2]; carry = (sum < (__force u32)saddr->s6_addr32[2]); sum += carry; sum += (__force u32)saddr->s6_addr32[3]; carry = (sum < (__force u32)saddr->s6_addr32[3]); sum += carry; sum += (__force u32)daddr->s6_addr32[0]; carry = (sum < (__force u32)daddr->s6_addr32[0]); sum += carry; sum += (__force u32)daddr->s6_addr32[1]; carry = (sum < (__force u32)daddr->s6_addr32[1]); sum += carry; sum += (__force u32)daddr->s6_addr32[2]; carry = (sum < (__force u32)daddr->s6_addr32[2]); sum += carry; sum += (__force u32)daddr->s6_addr32[3]; carry = (sum < (__force u32)daddr->s6_addr32[3]); sum += carry; ulen = (__force u32)htonl((__u32) len); sum += ulen; carry = (sum < ulen); sum += carry; uproto = (__force u32)htonl(proto); sum += uproto; carry = (sum < uproto); sum += carry; return csum_fold((__force __wsum)sum); } EXPORT_SYMBOL(csum_ipv6_magic); #endif int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { int err; UDP_SKB_CB(skb)->partial_cov = 0; UDP_SKB_CB(skb)->cscov = skb->len; if (proto == IPPROTO_UDPLITE) { err = udplite_checksum_init(skb, uh); if (err) return err; if (UDP_SKB_CB(skb)->partial_cov) { skb->csum = ip6_compute_pseudo(skb, proto); return 0; } } /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum * for IPv6 (set by socket option). * * Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, ip6_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat is as such. */ skb_checksum_complete_unset(skb); } return 0; } EXPORT_SYMBOL(udp6_csum_init); /* Function to set UDP checksum for an IPv6 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. */ void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp6_set_csum);
2 2 9 9 5 5 5 6 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 // SPDX-License-Identifier: GPL-2.0-only /* * Stream Parser * * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #include <linux/bpf.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/file.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/init.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/uaccess.h> #include <linux/workqueue.h> #include <net/strparser.h> #include <net/netns/generic.h> #include <net/sock.h> static struct workqueue_struct *strp_wq; static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { return (struct _strp_msg *)((void *)skb->cb + offsetof(struct sk_skb_cb, strp)); } /* Lower lock held */ static void strp_abort_strp(struct strparser *strp, int err) { /* Unrecoverable error in receive */ cancel_delayed_work(&strp->msg_timer_work); if (strp->stopped) return; strp->stopped = 1; if (strp->sk) { struct sock *sk = strp->sk; /* Report an error on the lower socket */ sk->sk_err = -err; sk_error_report(sk); } } static void strp_start_timer(struct strparser *strp, long timeo) { if (timeo && timeo != LONG_MAX) mod_delayed_work(strp_wq, &strp->msg_timer_work, timeo); } /* Lower lock held */ static void strp_parser_err(struct strparser *strp, int err, read_descriptor_t *desc) { desc->error = err; kfree_skb(strp->skb_head); strp->skb_head = NULL; strp->cb.abort_parser(strp, err); } static inline int strp_peek_len(struct strparser *strp) { if (strp->sk) { struct socket *sock = strp->sk->sk_socket; return sock->ops->peek_len(sock); } /* If we don't have an associated socket there's nothing to peek. * Return int max to avoid stopping the strparser. */ return INT_MAX; } /* Lower socket lock held */ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, unsigned int orig_offset, size_t orig_len, size_t max_msg_size, long timeo) { struct strparser *strp = (struct strparser *)desc->arg.data; struct _strp_msg *stm; struct sk_buff *head, *skb; size_t eaten = 0, cand_len; ssize_t extra; int err; bool cloned_orig = false; if (strp->paused) return 0; head = strp->skb_head; if (head) { /* Message already in progress */ if (unlikely(orig_offset)) { /* Getting data with a non-zero offset when a message is * in progress is not expected. If it does happen, we * need to clone and pull since we can't deal with * offsets in the skbs for a message expect in the head. */ orig_skb = skb_clone(orig_skb, GFP_ATOMIC); if (!orig_skb) { STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } if (!pskb_pull(orig_skb, orig_offset)) { STRP_STATS_INCR(strp->stats.mem_fail); kfree_skb(orig_skb); desc->error = -ENOMEM; return 0; } cloned_orig = true; orig_offset = 0; } if (!strp->skb_nextp) { /* We are going to append to the frags_list of head. * Need to unshare the frag_list. */ err = skb_unclone(head, GFP_ATOMIC); if (err) { STRP_STATS_INCR(strp->stats.mem_fail); desc->error = err; return 0; } if (unlikely(skb_shinfo(head)->frag_list)) { /* We can't append to an sk_buff that already * has a frag_list. We create a new head, point * the frag_list of that to the old head, and * then are able to use the old head->next for * appending to the message. */ if (WARN_ON(head->next)) { desc->error = -EINVAL; return 0; } skb = alloc_skb_for_msg(head); if (!skb) { STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } strp->skb_nextp = &head->next; strp->skb_head = skb; head = skb; } else { strp->skb_nextp = &skb_shinfo(head)->frag_list; } } } while (eaten < orig_len) { /* Always clone since we will consume something */ skb = skb_clone(orig_skb, GFP_ATOMIC); if (!skb) { STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; break; } cand_len = orig_len - eaten; head = strp->skb_head; if (!head) { head = skb; strp->skb_head = head; /* Will set skb_nextp on next packet if needed */ strp->skb_nextp = NULL; stm = _strp_msg(head); memset(stm, 0, sizeof(*stm)); stm->strp.offset = orig_offset + eaten; } else { /* Unclone if we are appending to an skb that we * already share a frag_list with. */ if (skb_has_frag_list(skb)) { err = skb_unclone(skb, GFP_ATOMIC); if (err) { STRP_STATS_INCR(strp->stats.mem_fail); desc->error = err; break; } } stm = _strp_msg(head); *strp->skb_nextp = skb; strp->skb_nextp = &skb->next; head->data_len += skb->len; head->len += skb->len; head->truesize += skb->truesize; } if (!stm->strp.full_len) { ssize_t len; len = (*strp->cb.parse_msg)(strp, head); if (!len) { /* Need more header to determine length */ if (!stm->accum_len) { /* Start RX timer for new message */ strp_start_timer(strp, timeo); } stm->accum_len += cand_len; eaten += cand_len; STRP_STATS_INCR(strp->stats.need_more_hdr); WARN_ON(eaten != orig_len); break; } else if (len < 0) { if (len == -ESTRPIPE && stm->accum_len) { len = -ENODATA; strp->unrecov_intr = 1; } else { strp->interrupted = 1; } strp_parser_err(strp, len, desc); break; } else if (len > max_msg_size) { /* Message length exceeds maximum allowed */ STRP_STATS_INCR(strp->stats.msg_too_big); strp_parser_err(strp, -EMSGSIZE, desc); break; } else if (len <= (ssize_t)head->len - skb->len - stm->strp.offset) { /* Length must be into new skb (and also * greater than zero) */ STRP_STATS_INCR(strp->stats.bad_hdr_len); strp_parser_err(strp, -EPROTO, desc); break; } stm->strp.full_len = len; } extra = (ssize_t)(stm->accum_len + cand_len) - stm->strp.full_len; if (extra < 0) { /* Message not complete yet. */ if (stm->strp.full_len - stm->accum_len > strp_peek_len(strp)) { /* Don't have the whole message in the socket * buffer. Set strp->need_bytes to wait for * the rest of the message. Also, set "early * eaten" since we've already buffered the skb * but don't consume yet per strp_read_sock. */ if (!stm->accum_len) { /* Start RX timer for new message */ strp_start_timer(strp, timeo); } stm->accum_len += cand_len; eaten += cand_len; strp->need_bytes = stm->strp.full_len - stm->accum_len; STRP_STATS_ADD(strp->stats.bytes, cand_len); desc->count = 0; /* Stop reading socket */ break; } stm->accum_len += cand_len; eaten += cand_len; WARN_ON(eaten != orig_len); break; } /* Positive extra indicates more bytes than needed for the * message */ WARN_ON(extra > cand_len); eaten += (cand_len - extra); /* Hurray, we have a new message! */ cancel_delayed_work(&strp->msg_timer_work); strp->skb_head = NULL; strp->need_bytes = 0; STRP_STATS_INCR(strp->stats.msgs); /* Give skb to upper layer */ strp->cb.rcv_msg(strp, head); if (unlikely(strp->paused)) { /* Upper layer paused strp */ break; } } if (cloned_orig) kfree_skb(orig_skb); STRP_STATS_ADD(strp->stats.bytes, eaten); return eaten; } int strp_process(struct strparser *strp, struct sk_buff *orig_skb, unsigned int orig_offset, size_t orig_len, size_t max_msg_size, long timeo) { read_descriptor_t desc; /* Dummy arg to strp_recv */ desc.arg.data = strp; return __strp_recv(&desc, orig_skb, orig_offset, orig_len, max_msg_size, timeo); } EXPORT_SYMBOL_GPL(strp_process); static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, unsigned int orig_offset, size_t orig_len) { struct strparser *strp = (struct strparser *)desc->arg.data; return __strp_recv(desc, orig_skb, orig_offset, orig_len, strp->sk->sk_rcvbuf, strp->sk->sk_rcvtimeo); } static int default_read_sock_done(struct strparser *strp, int err) { return err; } /* Called with lock held on lower socket */ static int strp_read_sock(struct strparser *strp) { struct socket *sock = strp->sk->sk_socket; read_descriptor_t desc; if (unlikely(!sock || !sock->ops)) return -EBUSY; if (unlikely(!strp->cb.read_sock && !sock->ops->read_sock)) return -EBUSY; desc.arg.data = strp; desc.error = 0; desc.count = 1; /* give more than one skb per call */ /* sk should be locked here, so okay to do read_sock */ if (strp->cb.read_sock) strp->cb.read_sock(strp, &desc, strp_recv); else sock->ops->read_sock(strp->sk, &desc, strp_recv); desc.error = strp->cb.read_sock_done(strp, desc.error); return desc.error; } /* Lower sock lock held */ void strp_data_ready(struct strparser *strp) { if (unlikely(strp->stopped) || strp->paused) return; /* This check is needed to synchronize with do_strp_work. * do_strp_work acquires a process lock (lock_sock) whereas * the lock held here is bh_lock_sock. The two locks can be * held by different threads at the same time, but bh_lock_sock * allows a thread in BH context to safely check if the process * lock is held. In this case, if the lock is held, queue work. */ if (sock_owned_by_user_nocheck(strp->sk)) { queue_work(strp_wq, &strp->work); return; } if (strp->need_bytes) { if (strp_peek_len(strp) < strp->need_bytes) return; } if (strp_read_sock(strp) == -ENOMEM) queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_data_ready); static void do_strp_work(struct strparser *strp) { /* We need the read lock to synchronize with strp_data_ready. We * need the socket lock for calling strp_read_sock. */ strp->cb.lock(strp); if (unlikely(strp->stopped)) goto out; if (strp->paused) goto out; if (strp_read_sock(strp) == -ENOMEM) queue_work(strp_wq, &strp->work); out: strp->cb.unlock(strp); } static void strp_work(struct work_struct *w) { do_strp_work(container_of(w, struct strparser, work)); } static void strp_msg_timeout(struct work_struct *w) { struct strparser *strp = container_of(w, struct strparser, msg_timer_work.work); /* Message assembly timed out */ STRP_STATS_INCR(strp->stats.msg_timeouts); strp->cb.lock(strp); strp->cb.abort_parser(strp, -ETIMEDOUT); strp->cb.unlock(strp); } static void strp_sock_lock(struct strparser *strp) { lock_sock(strp->sk); } static void strp_sock_unlock(struct strparser *strp) { release_sock(strp->sk); } int strp_init(struct strparser *strp, struct sock *sk, const struct strp_callbacks *cb) { if (!cb || !cb->rcv_msg || !cb->parse_msg) return -EINVAL; /* The sk (sock) arg determines the mode of the stream parser. * * If the sock is set then the strparser is in receive callback mode. * The upper layer calls strp_data_ready to kick receive processing * and strparser calls the read_sock function on the socket to * get packets. * * If the sock is not set then the strparser is in general mode. * The upper layer calls strp_process for each skb to be parsed. */ if (!sk) { if (!cb->lock || !cb->unlock) return -EINVAL; } memset(strp, 0, sizeof(*strp)); strp->sk = sk; strp->cb.lock = cb->lock ? : strp_sock_lock; strp->cb.unlock = cb->unlock ? : strp_sock_unlock; strp->cb.rcv_msg = cb->rcv_msg; strp->cb.parse_msg = cb->parse_msg; strp->cb.read_sock = cb->read_sock; strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp; INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout); INIT_WORK(&strp->work, strp_work); return 0; } EXPORT_SYMBOL_GPL(strp_init); /* Sock process lock held (lock_sock) */ void __strp_unpause(struct strparser *strp) { strp->paused = 0; if (strp->need_bytes) { if (strp_peek_len(strp) < strp->need_bytes) return; } strp_read_sock(strp); } EXPORT_SYMBOL_GPL(__strp_unpause); void strp_unpause(struct strparser *strp) { strp->paused = 0; /* Sync setting paused with RX work */ smp_mb(); queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_unpause); /* strp must already be stopped so that strp_recv will no longer be called. * Note that strp_done is not called with the lower socket held. */ void strp_done(struct strparser *strp) { WARN_ON(!strp->stopped); cancel_delayed_work_sync(&strp->msg_timer_work); cancel_work_sync(&strp->work); if (strp->skb_head) { kfree_skb(strp->skb_head); strp->skb_head = NULL; } } EXPORT_SYMBOL_GPL(strp_done); void strp_stop(struct strparser *strp) { strp->stopped = 1; } EXPORT_SYMBOL_GPL(strp_stop); void strp_check_rcv(struct strparser *strp) { queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_check_rcv); static int __init strp_dev_init(void) { BUILD_BUG_ON(sizeof(struct sk_skb_cb) > sizeof_field(struct sk_buff, cb)); strp_wq = create_singlethread_workqueue("kstrp"); if (unlikely(!strp_wq)) return -ENOMEM; return 0; } device_initcall(strp_dev_init);
8 5 3 3 3 3 2 3 5 2 5 2 6 1 5 1 1 1 1 1 2 1 17 5 12 4 4 4 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 // SPDX-License-Identifier: GPL-2.0-only #include <linux/phy.h> #include <linux/ethtool_netlink.h> #include <net/netdev_lock.h> #include "netlink.h" #include "common.h" /* 802.3 standard allows 100 meters for BaseT cables. However longer * cables might work, depending on the quality of the cables and the * PHY. So allow testing for up to 150 meters. */ #define MAX_CABLE_LENGTH_CM (150 * 100) const struct nla_policy ethnl_cable_test_act_policy[] = { [ETHTOOL_A_CABLE_TEST_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd) { struct sk_buff *skb; int err = -ENOMEM; void *ehdr; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; ehdr = ethnl_bcastmsg_put(skb, cmd); if (!ehdr) { err = -EMSGSIZE; goto out; } err = ethnl_fill_reply_header(skb, phydev->attached_dev, ETHTOOL_A_CABLE_TEST_NTF_HEADER); if (err) goto out; err = nla_put_u8(skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED); if (err) goto out; genlmsg_end(skb, ehdr); return ethnl_multicast(skb, phydev->attached_dev); out: nlmsg_free(skb); phydev_err(phydev, "%s: Error %pe\n", __func__, ERR_PTR(err)); return err; } int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) { struct ethnl_req_info req_info = {}; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_device *phydev; struct net_device *dev; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_CABLE_TEST_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; rtnl_lock(); netdev_lock_ops(dev); phydev = ethnl_req_get_phydev(&req_info, tb, ETHTOOL_A_CABLE_TEST_HEADER, info->extack); if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; goto out_unlock; } ops = ethtool_phy_ops; if (!ops || !ops->start_cable_test) { ret = -EOPNOTSUPP; goto out_unlock; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out_unlock; ret = ops->start_cable_test(phydev, info->extack); ethnl_ops_complete(dev); if (!ret) ethnl_cable_test_started(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); out_unlock: netdev_unlock_ops(dev); rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; } int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; /* One TDR sample occupies 20 bytes. For a 150 meter cable, * with four pairs, around 12K is needed. */ phydev->skb = genlmsg_new(SZ_16K, GFP_KERNEL); if (!phydev->skb) goto out; phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, cmd); if (!phydev->ehdr) { err = -EMSGSIZE; goto out; } err = ethnl_fill_reply_header(phydev->skb, phydev->attached_dev, ETHTOOL_A_CABLE_TEST_NTF_HEADER); if (err) goto out; err = nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED); if (err) goto out; phydev->nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_NEST); if (!phydev->nest) { err = -EMSGSIZE; goto out; } return 0; out: nlmsg_free(phydev->skb); phydev->skb = NULL; return err; } EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); void ethnl_cable_test_free(struct phy_device *phydev) { nlmsg_free(phydev->skb); phydev->skb = NULL; } EXPORT_SYMBOL_GPL(ethnl_cable_test_free); void ethnl_cable_test_finished(struct phy_device *phydev) { nla_nest_end(phydev->skb, phydev->nest); genlmsg_end(phydev->skb, phydev->ehdr); ethnl_multicast(phydev->skb, phydev->attached_dev); } EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); int ethnl_cable_test_result_with_src(struct phy_device *phydev, u8 pair, u8 result, u32 src) { struct nlattr *nest; int ret = -EMSGSIZE; nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_NEST_RESULT); if (!nest) return -EMSGSIZE; if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_PAIR, pair)) goto err; if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_CODE, result)) goto err; if (src != ETHTOOL_A_CABLE_INF_SRC_UNSPEC) { if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_RESULT_SRC, src)) goto err; } nla_nest_end(phydev->skb, nest); return 0; err: nla_nest_cancel(phydev->skb, nest); return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_result_with_src); int ethnl_cable_test_fault_length_with_src(struct phy_device *phydev, u8 pair, u32 cm, u32 src) { struct nlattr *nest; int ret = -EMSGSIZE; nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_NEST_FAULT_LENGTH); if (!nest) return -EMSGSIZE; if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, pair)) goto err; if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_CM, cm)) goto err; if (src != ETHTOOL_A_CABLE_INF_SRC_UNSPEC) { if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, src)) goto err; } nla_nest_end(phydev->skb, nest); return 0; err: nla_nest_cancel(phydev->skb, nest); return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length_with_src); static const struct nla_policy cable_test_tdr_act_cfg_policy[] = { [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 }, [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 }, [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 }, }; const struct nla_policy ethnl_cable_test_tdr_act_policy[] = { [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED }, }; /* CABLE_TEST_TDR_ACT */ static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, struct genl_info *info, struct phy_tdr_config *cfg) { struct nlattr *tb[ARRAY_SIZE(cable_test_tdr_act_cfg_policy)]; int ret; cfg->first = 100; cfg->step = 100; cfg->last = MAX_CABLE_LENGTH_CM; cfg->pair = PHY_PAIR_ALL; if (!nest) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(cable_test_tdr_act_cfg_policy) - 1, nest, cable_test_tdr_act_cfg_policy, info->extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]) cfg->first = nla_get_u32( tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]); if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]) cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]); if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]) cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]); if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) { cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]); if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) { NL_SET_ERR_MSG_ATTR( info->extack, tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR], "invalid pair parameter"); return -EINVAL; } } if (cfg->first > MAX_CABLE_LENGTH_CM) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST], "invalid first parameter"); return -EINVAL; } if (cfg->last > MAX_CABLE_LENGTH_CM) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST], "invalid last parameter"); return -EINVAL; } if (cfg->first > cfg->last) { NL_SET_ERR_MSG(info->extack, "invalid first/last parameter"); return -EINVAL; } if (!cfg->step) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], "invalid step parameter"); return -EINVAL; } if (cfg->step > (cfg->last - cfg->first)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], "step parameter too big"); return -EINVAL; } return 0; } int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) { struct ethnl_req_info req_info = {}; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_device *phydev; struct phy_tdr_config cfg; struct net_device *dev; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG], info, &cfg); if (ret) goto out_dev_put; rtnl_lock(); netdev_lock_ops(dev); phydev = ethnl_req_get_phydev(&req_info, tb, ETHTOOL_A_CABLE_TEST_TDR_HEADER, info->extack); if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; goto out_unlock; } ops = ethtool_phy_ops; if (!ops || !ops->start_cable_test_tdr) { ret = -EOPNOTSUPP; goto out_unlock; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out_unlock; ret = ops->start_cable_test_tdr(phydev, info->extack, &cfg); ethnl_ops_complete(dev); if (!ret) ethnl_cable_test_started(phydev, ETHTOOL_MSG_CABLE_TEST_TDR_NTF); out_unlock: netdev_unlock_ops(dev); rtnl_unlock(); out_dev_put: ethnl_parse_header_dev_put(&req_info); return ret; } int ethnl_cable_test_amplitude(struct phy_device *phydev, u8 pair, s16 mV) { struct nlattr *nest; int ret = -EMSGSIZE; nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE); if (!nest) return -EMSGSIZE; if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_PAIR, pair)) goto err; if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_mV, mV)) goto err; nla_nest_end(phydev->skb, nest); return 0; err: nla_nest_cancel(phydev->skb, nest); return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_amplitude); int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV) { struct nlattr *nest; int ret = -EMSGSIZE; nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_PULSE); if (!nest) return -EMSGSIZE; if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_PULSE_mV, mV)) goto err; nla_nest_end(phydev->skb, nest); return 0; err: nla_nest_cancel(phydev->skb, nest); return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_pulse); int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, u32 step) { struct nlattr *nest; int ret = -EMSGSIZE; nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_STEP); if (!nest) return -EMSGSIZE; if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE, first)) goto err; if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, last)) goto err; if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, step)) goto err; nla_nest_end(phydev->skb, nest); return 0; err: nla_nest_cancel(phydev->skb, nest); return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_step);
825 231 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_PGALLOC_H #define __ASM_GENERIC_PGALLOC_H #ifdef CONFIG_MMU #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) /** * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * This function is intended for architectures that need * anything beyond simple page allocation. * * Return: pointer to the allocated memory or %NULL on error */ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) { struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); if (!ptdesc) return NULL; return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL /** * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * Return: pointer to the allocated memory or %NULL on error */ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm) { return __pte_alloc_one_kernel_noprof(mm); } #define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__)) #endif /** * pte_free_kernel - free PTE-level kernel page table memory * @mm: the mm_struct of the current context * @pte: pointer to the memory containing the page table */ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { pagetable_free(virt_to_ptdesc(pte)); } /** * __pte_alloc_one - allocate memory for a PTE-level user page table * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp) { struct ptdesc *ptdesc; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_page(ptdesc); } #define __pte_alloc_one(...) alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE /** * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm) { return __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER); } #define pte_alloc_one(...) alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__)) #endif /* * Should really implement gc for free page table pages. This could be * done with a reference count in struct page. */ /** * pte_free - free PTE-level user page table memory * @mm: the mm_struct of the current context * @pte_page: the `struct page` referencing the ptdesc */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); pagetable_dtor_free(ptdesc); } #if CONFIG_PGTABLE_LEVELS > 2 #ifndef __HAVE_ARCH_PMD_ALLOC_ONE /** * pmd_alloc_one - allocate memory for a PMD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pmd_ctor(). * * Allocations use %GFP_PGTABLE_USER in user context and * %GFP_PGTABLE_KERNEL in kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { struct ptdesc *ptdesc; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pmd_ctor(ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) #endif #ifndef __HAVE_ARCH_PMD_FREE static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_pud_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate memory for a PUD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table using %GFP_PGTABLE_USER for user context * and %GFP_PGTABLE_KERNEL for kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { return __pud_alloc_one_noprof(mm, addr); } #define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE static inline void pud_free(struct mm_struct *mm, pud_t *pud) { __pud_free(mm, pud); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #if CONFIG_PGTABLE_LEVELS > 4 static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_p4d_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_P4D_ALLOC_ONE static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { return __p4d_alloc_one_noprof(mm, addr); } #define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) { struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_P4D_FREE static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { if (!mm_p4d_folded(mm)) __p4d_free(mm, p4d); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 4 */ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, order); if (!ptdesc) return NULL; pagetable_pgd_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } #endif #endif /* CONFIG_MMU */ #endif /* __ASM_GENERIC_PGALLOC_H */
4 4 2 6 5 6 1 6 6 6 7 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <rdma/rdma_netlink.h> #include <net/addrconf.h> #include "rxe.h" #include "rxe_loc.h" MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); MODULE_DESCRIPTION("Soft RDMA transport"); MODULE_LICENSE("Dual BSD/GPL"); /* free resources for a rxe device all objects created for this device must * have been destroyed */ void rxe_dealloc(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); rxe_pool_cleanup(&rxe->uc_pool); rxe_pool_cleanup(&rxe->pd_pool); rxe_pool_cleanup(&rxe->ah_pool); rxe_pool_cleanup(&rxe->srq_pool); rxe_pool_cleanup(&rxe->qp_pool); rxe_pool_cleanup(&rxe->cq_pool); rxe_pool_cleanup(&rxe->mr_pool); rxe_pool_cleanup(&rxe->mw_pool); WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree)); mutex_destroy(&rxe->usdev_lock); } /* initialize rxe device parameters */ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) { rxe->max_inline_data = RXE_MAX_INLINE_DATA; rxe->attr.vendor_id = RXE_VENDOR_ID; rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; rxe->attr.max_qp = RXE_MAX_QP; rxe->attr.max_qp_wr = RXE_MAX_QP_WR; rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; rxe->attr.kernel_cap_flags = IBK_ALLOW_USER_UNREG; rxe->attr.max_send_sge = RXE_MAX_SGE; rxe->attr.max_recv_sge = RXE_MAX_SGE; rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; rxe->attr.max_cq = RXE_MAX_CQ; rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; rxe->attr.max_mr = RXE_MAX_MR; rxe->attr.max_mw = RXE_MAX_MW; rxe->attr.max_pd = RXE_MAX_PD; rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM; rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; rxe->attr.atomic_cap = IB_ATOMIC_HCA; rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP; rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH; rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH; rxe->attr.max_ah = RXE_MAX_AH; rxe->attr.max_srq = RXE_MAX_SRQ; rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR; rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE; rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; if (ndev->addr_len) { memcpy(rxe->raw_gid, ndev->dev_addr, min_t(unsigned int, ndev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(rxe->raw_gid); } addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, rxe->raw_gid); rxe->max_ucontext = RXE_MAX_UCONTEXT; if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING; /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */ rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; } } /* initialize port attributes */ static void rxe_init_port_param(struct rxe_port *port) { port->attr.state = IB_PORT_DOWN; port->attr.max_mtu = IB_MTU_4096; port->attr.active_mtu = IB_MTU_256; port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR; port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR; port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN; port->attr.lid = RXE_PORT_LID; port->attr.sm_lid = RXE_PORT_SM_LID; port->attr.lmc = RXE_PORT_LMC; port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM; port->attr.sm_sl = RXE_PORT_SM_SL; port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT; port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY; port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; port->attr.phys_state = RXE_PORT_PHYS_STATE; port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256); port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); } /* initialize port state, note IB convention that HCA ports are always * numbered from 1 */ static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev) { struct rxe_port *port = &rxe->port; rxe_init_port_param(port); addrconf_addr_eui48((unsigned char *)&port->port_guid, rxe->raw_gid); spin_lock_init(&port->port_lock); } /* init pools of managed objects */ static void rxe_init_pools(struct rxe_dev *rxe) { rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC); rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD); rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH); rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ); rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP); rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ); rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR); rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW); } /* initialize rxe device state */ static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev) { /* init default device parameters */ rxe_init_device_param(rxe, ndev); rxe_init_ports(rxe, ndev); rxe_init_pools(rxe); /* init pending mmap list */ spin_lock_init(&rxe->mmap_offset_lock); spin_lock_init(&rxe->pending_lock); INIT_LIST_HEAD(&rxe->pending_mmaps); /* init multicast support */ spin_lock_init(&rxe->mcg_lock); rxe->mcg_tree = RB_ROOT; mutex_init(&rxe->usdev_lock); } void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) { struct rxe_port *port = &rxe->port; enum ib_mtu mtu; mtu = eth_mtu_int_to_enum(ndev_mtu); /* Make sure that new MTU in range */ mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256; port->attr.active_mtu = mtu; port->mtu_cap = ib_mtu_enum_to_int(mtu); } /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. */ int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, struct net_device *ndev) { rxe_init(rxe, ndev); rxe_set_mtu(rxe, mtu); return rxe_register_device(rxe, ibdev_name, ndev); } static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) { struct rxe_dev *rxe; int err = 0; if (is_vlan_dev(ndev)) { rxe_err("rxe creation allowed on top of a real device only\n"); err = -EPERM; goto err; } rxe = rxe_get_dev_from_net(ndev); if (rxe) { ib_device_put(&rxe->ib_dev); rxe_err_dev(rxe, "already configured on %s\n", ndev->name); err = -EEXIST; goto err; } err = rxe_net_add(ibdev_name, ndev); if (err) { rxe_err("failed to add %s\n", ndev->name); goto err; } err: return err; } static struct rdma_link_ops rxe_link_ops = { .type = "rxe", .newlink = rxe_newlink, }; static int __init rxe_module_init(void) { int err; err = rxe_alloc_wq(); if (err) return err; err = rxe_net_init(); if (err) { rxe_destroy_wq(); return err; } rdma_link_register(&rxe_link_ops); pr_info("loaded\n"); return 0; } static void __exit rxe_module_exit(void) { rdma_link_unregister(&rxe_link_ops); ib_unregister_driver(RDMA_DRIVER_RXE); rxe_net_exit(); rxe_destroy_wq(); pr_info("unloaded\n"); } late_initcall(rxe_module_init); module_exit(rxe_module_exit); MODULE_ALIAS_RDMA_LINK("rxe");
11 5 6 4 1 1 2 5 1 2 2 2 3 1 1 1 1 3 1 2 2 6 1 1 1 1 2 2 4 1 1 2 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <net/genetlink.h> #include <net/ncsi.h> #include <linux/skbuff.h> #include <net/sock.h> #include <uapi/linux/ncsi.h> #include "internal.h" #include "ncsi-pkt.h" #include "ncsi-netlink.h" static struct genl_family ncsi_genl_family; static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { [NCSI_ATTR_IFINDEX] = { .type = NLA_U32 }, [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED }, [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 }, [NCSI_ATTR_MULTI_FLAG] = { .type = NLA_FLAG }, [NCSI_ATTR_PACKAGE_MASK] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_MASK] = { .type = NLA_U32 }, }; static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) { struct ncsi_dev_priv *ndp; struct net_device *dev; struct ncsi_dev *nd; struct ncsi_dev; if (!net) return NULL; dev = dev_get_by_index(net, ifindex); if (!dev) { pr_err("NCSI netlink: No device for ifindex %u\n", ifindex); return NULL; } nd = ncsi_find_dev(dev); ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; dev_put(dev); return ndp; } static int ncsi_write_channel_info(struct sk_buff *skb, struct ncsi_dev_priv *ndp, struct ncsi_channel *nc) { struct ncsi_channel_vlan_filter *ncf; struct ncsi_channel_mode *m; struct nlattr *vid_nest; int i; nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id); m = &nc->modes[NCSI_MODE_LINK]; nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]); if (nc->state == NCSI_CHANNEL_ACTIVE) nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE); if (nc == nc->package->preferred_channel) nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.major); nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.minor); nla_put_string(skb, NCSI_CHANNEL_ATTR_VERSION_STR, nc->version.fw_name); vid_nest = nla_nest_start_noflag(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); if (!vid_nest) return -ENOMEM; ncf = &nc->vlan_filter; i = -1; while ((i = find_next_bit((void *)&ncf->bitmap, ncf->n_vids, i + 1)) < ncf->n_vids) { if (ncf->vids[i]) nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID, ncf->vids[i]); } nla_nest_end(skb, vid_nest); return 0; } static int ncsi_write_package_info(struct sk_buff *skb, struct ncsi_dev_priv *ndp, unsigned int id) { struct nlattr *pnest, *cnest, *nest; struct ncsi_package *np; struct ncsi_channel *nc; bool found; int rc; if (id > ndp->package_num - 1) { netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id); return -ENODEV; } found = false; NCSI_FOR_EACH_PACKAGE(ndp, np) { if (np->id != id) continue; pnest = nla_nest_start_noflag(skb, NCSI_PKG_ATTR); if (!pnest) return -ENOMEM; rc = nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); if (rc) { nla_nest_cancel(skb, pnest); return rc; } if ((0x1 << np->id) == ndp->package_whitelist) nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); cnest = nla_nest_start_noflag(skb, NCSI_PKG_ATTR_CHANNEL_LIST); if (!cnest) { nla_nest_cancel(skb, pnest); return -ENOMEM; } NCSI_FOR_EACH_CHANNEL(np, nc) { nest = nla_nest_start_noflag(skb, NCSI_CHANNEL_ATTR); if (!nest) { nla_nest_cancel(skb, cnest); nla_nest_cancel(skb, pnest); return -ENOMEM; } rc = ncsi_write_channel_info(skb, ndp, nc); if (rc) { nla_nest_cancel(skb, nest); nla_nest_cancel(skb, cnest); nla_nest_cancel(skb, pnest); return rc; } nla_nest_end(skb, nest); } nla_nest_end(skb, cnest); nla_nest_end(skb, pnest); found = true; } if (!found) return -ENODEV; return 0; } static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; unsigned int package_id; struct sk_buff *skb; struct nlattr *attr; void *hdr; int rc; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) return -EINVAL; ndp = ndp_from_ifindex(genl_info_net(info), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); if (!hdr) { kfree_skb(skb); return -EMSGSIZE; } package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); attr = nla_nest_start_noflag(skb, NCSI_ATTR_PACKAGE_LIST); if (!attr) { kfree_skb(skb); return -EMSGSIZE; } rc = ncsi_write_package_info(skb, ndp, package_id); if (rc) { nla_nest_cancel(skb, attr); goto err; } nla_nest_end(skb, attr); genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err: kfree_skb(skb); return rc; } static int ncsi_pkg_info_all_nl(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *attrs[NCSI_ATTR_MAX + 1]; struct ncsi_package *np, *package; struct ncsi_dev_priv *ndp; unsigned int package_id; struct nlattr *attr; void *hdr; int rc; rc = genlmsg_parse_deprecated(cb->nlh, &ncsi_genl_family, attrs, NCSI_ATTR_MAX, ncsi_genl_policy, NULL); if (rc) return rc; if (!attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(skb->sk)), nla_get_u32(attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; package_id = cb->args[0]; package = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == package_id) package = np; if (!package) return 0; /* done */ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ncsi_genl_family, NLM_F_MULTI, NCSI_CMD_PKG_INFO); if (!hdr) { rc = -EMSGSIZE; goto err; } attr = nla_nest_start_noflag(skb, NCSI_ATTR_PACKAGE_LIST); if (!attr) { rc = -EMSGSIZE; goto err; } rc = ncsi_write_package_info(skb, ndp, package->id); if (rc) { nla_nest_cancel(skb, attr); goto err; } nla_nest_end(skb, attr); genlmsg_end(skb, hdr); cb->args[0] = package_id + 1; return skb->len; err: genlmsg_cancel(skb, hdr); return rc; } static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_package *np, *package; struct ncsi_channel *nc, *channel; u32 package_id, channel_id; struct ncsi_dev_priv *ndp; unsigned long flags; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); package = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == package_id) package = np; if (!package) { /* The user has set a package that does not exist */ return -ERANGE; } channel = NULL; if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); NCSI_FOR_EACH_CHANNEL(package, nc) if (nc->id == channel_id) { channel = nc; break; } if (!channel) { netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", channel_id); return -ERANGE; } } spin_lock_irqsave(&ndp->lock, flags); ndp->package_whitelist = 0x1 << package->id; ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); spin_lock_irqsave(&package->lock, flags); package->multi_channel = false; if (channel) { package->channel_whitelist = 0x1 << channel->id; package->preferred_channel = channel; } else { /* Allow any channel */ package->channel_whitelist = UINT_MAX; package->preferred_channel = NULL; } spin_unlock_irqrestore(&package->lock, flags); if (channel) netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x as preferred\n", package_id, channel_id); else netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n", package_id); /* Update channel configuration */ if (!(ndp->flags & NCSI_DEV_RESET)) ncsi_reset_dev(&ndp->ndev); return 0; } static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; struct ncsi_package *np; unsigned long flags; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; /* Reset any whitelists and disable multi mode */ spin_lock_irqsave(&ndp->lock, flags); ndp->package_whitelist = UINT_MAX; ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); NCSI_FOR_EACH_PACKAGE(ndp, np) { spin_lock_irqsave(&np->lock, flags); np->multi_channel = false; np->channel_whitelist = UINT_MAX; np->preferred_channel = NULL; spin_unlock_irqrestore(&np->lock, flags); } netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n"); /* Update channel configuration */ if (!(ndp->flags & NCSI_DEV_RESET)) ncsi_reset_dev(&ndp->ndev); return 0; } static int ncsi_send_cmd_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; struct ncsi_pkt_hdr *hdr; struct ncsi_cmd_arg nca; unsigned char *data; u32 package_id; u32 channel_id; int len, ret; if (!info || !info->attrs) { ret = -EINVAL; goto out; } if (!info->attrs[NCSI_ATTR_IFINDEX]) { ret = -EINVAL; goto out; } if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) { ret = -EINVAL; goto out; } if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { ret = -EINVAL; goto out; } if (!info->attrs[NCSI_ATTR_DATA]) { ret = -EINVAL; goto out; } ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) { ret = -ENODEV; goto out; } package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); if (package_id >= NCSI_MAX_PACKAGE || channel_id >= NCSI_MAX_CHANNEL) { ret = -ERANGE; goto out_netlink; } len = nla_len(info->attrs[NCSI_ATTR_DATA]); if (len < sizeof(struct ncsi_pkt_hdr)) { netdev_info(ndp->ndev.dev, "NCSI: no command to send %u\n", package_id); ret = -EINVAL; goto out_netlink; } else { data = (unsigned char *)nla_data(info->attrs[NCSI_ATTR_DATA]); } hdr = (struct ncsi_pkt_hdr *)data; nca.ndp = ndp; nca.package = (unsigned char)package_id; nca.channel = (unsigned char)channel_id; nca.type = hdr->type; nca.req_flags = NCSI_REQ_FLAG_NETLINK_DRIVEN; nca.info = info; nca.payload = ntohs(hdr->length); nca.data = data + sizeof(*hdr); ret = ncsi_xmit_cmd(&nca); out_netlink: if (ret != 0) { netdev_err(ndp->ndev.dev, "NCSI: Error %d sending command\n", ret); ncsi_send_netlink_err(ndp->ndev.dev, info->snd_seq, info->snd_portid, info->nlhdr, ret); } out: return ret; } int ncsi_send_netlink_rsp(struct ncsi_request *nr, struct ncsi_package *np, struct ncsi_channel *nc) { struct sk_buff *skb; struct net *net; void *hdr; int rc; net = dev_net(nr->rsp->dev); skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq, &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD); if (!hdr) { kfree_skb(skb); return -EMSGSIZE; } nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->rsp->dev->ifindex); if (np) nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id); if (nc) nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id); else nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL); rc = nla_put(skb, NCSI_ATTR_DATA, nr->rsp->len, (void *)nr->rsp->data); if (rc) goto err; genlmsg_end(skb, hdr); return genlmsg_unicast(net, skb, nr->snd_portid); err: kfree_skb(skb); return rc; } int ncsi_send_netlink_timeout(struct ncsi_request *nr, struct ncsi_package *np, struct ncsi_channel *nc) { struct sk_buff *skb; struct net *net; void *hdr; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq, &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD); if (!hdr) { kfree_skb(skb); return -EMSGSIZE; } net = dev_net(nr->cmd->dev); nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->cmd->dev->ifindex); if (np) nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id); else nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, NCSI_PACKAGE_INDEX((((struct ncsi_pkt_hdr *) nr->cmd->data)->channel))); if (nc) nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id); else nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL); genlmsg_end(skb, hdr); return genlmsg_unicast(net, skb, nr->snd_portid); } int ncsi_send_netlink_err(struct net_device *dev, u32 snd_seq, u32 snd_portid, const struct nlmsghdr *nlhdr, int err) { struct nlmsghdr *nlh; struct nlmsgerr *nle; struct sk_buff *skb; struct net *net; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; net = dev_net(dev); nlh = nlmsg_put(skb, snd_portid, snd_seq, NLMSG_ERROR, sizeof(*nle), 0); nle = (struct nlmsgerr *)nlmsg_data(nlh); nle->error = err; memcpy(&nle->msg, nlhdr, sizeof(*nlh)); nlmsg_end(skb, nlh); return nlmsg_unicast(net->genl_sock, skb, snd_portid); } static int ncsi_set_package_mask_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; unsigned long flags; int rc; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; if (!info->attrs[NCSI_ATTR_PACKAGE_MASK]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; spin_lock_irqsave(&ndp->lock, flags); if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { if (ndp->flags & NCSI_DEV_HWA) { ndp->multi_package = true; rc = 0; } else { netdev_err(ndp->ndev.dev, "NCSI: Can't use multiple packages without HWA\n"); rc = -EPERM; } } else { ndp->multi_package = false; rc = 0; } if (!rc) ndp->package_whitelist = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]); spin_unlock_irqrestore(&ndp->lock, flags); if (!rc) { /* Update channel configuration */ if (!(ndp->flags & NCSI_DEV_RESET)) ncsi_reset_dev(&ndp->ndev); } return rc; } static int ncsi_set_channel_mask_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_package *np, *package; struct ncsi_channel *nc, *channel; u32 package_id, channel_id; struct ncsi_dev_priv *ndp; unsigned long flags; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) return -EINVAL; if (!info->attrs[NCSI_ATTR_CHANNEL_MASK]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); package = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == package_id) { package = np; break; } if (!package) return -ERANGE; spin_lock_irqsave(&package->lock, flags); channel = NULL; if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); NCSI_FOR_EACH_CHANNEL(np, nc) if (nc->id == channel_id) { channel = nc; break; } if (!channel) { spin_unlock_irqrestore(&package->lock, flags); return -ERANGE; } netdev_dbg(ndp->ndev.dev, "NCSI: Channel %u set as preferred channel\n", channel->id); } package->channel_whitelist = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]); if (package->channel_whitelist == 0) netdev_dbg(ndp->ndev.dev, "NCSI: Package %u set to all channels disabled\n", package->id); package->preferred_channel = channel; if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { package->multi_channel = true; netdev_info(ndp->ndev.dev, "NCSI: Multi-channel enabled on package %u\n", package_id); } else { package->multi_channel = false; } spin_unlock_irqrestore(&package->lock, flags); /* Update channel configuration */ if (!(ndp->flags & NCSI_DEV_RESET)) ncsi_reset_dev(&ndp->ndev); return 0; } static const struct genl_small_ops ncsi_ops[] = { { .cmd = NCSI_CMD_PKG_INFO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_pkg_info_nl, .dumpit = ncsi_pkg_info_all_nl, .flags = 0, }, { .cmd = NCSI_CMD_SET_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_interface_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_CLEAR_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_clear_interface_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SEND_CMD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_send_cmd_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SET_PACKAGE_MASK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_package_mask_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SET_CHANNEL_MASK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_channel_mask_nl, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family ncsi_genl_family __ro_after_init = { .name = "NCSI", .version = 0, .maxattr = NCSI_ATTR_MAX, .policy = ncsi_genl_policy, .module = THIS_MODULE, .small_ops = ncsi_ops, .n_small_ops = ARRAY_SIZE(ncsi_ops), .resv_start_op = NCSI_CMD_SET_CHANNEL_MASK + 1, }; static int __init ncsi_init_netlink(void) { return genl_register_family(&ncsi_genl_family); } subsys_initcall(ncsi_init_netlink);
1750 34940 787 1748 1749 34984 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 /* SPDX-License-Identifier: GPL-2.0 */ /* * Latched RB-trees * * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org> * * Since RB-trees have non-atomic modifications they're not immediately suited * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for * lockless lookups; we cannot guarantee they return a correct result. * * The simplest solution is a seqlock + RB-tree, this will allow lockless * lookups; but has the constraint (inherent to the seqlock) that read sides * cannot nest in write sides. * * If we need to allow unconditional lookups (say as required for NMI context * usage) we need a more complex setup; this data structure provides this by * employing the latch technique -- see @write_seqcount_latch_begin -- to * implement a latched RB-tree which does allow for unconditional lookups by * virtue of always having (at least) one stable copy of the tree. * * However, while we have the guarantee that there is at all times one stable * copy, this does not guarantee an iteration will not observe modifications. * What might have been a stable copy at the start of the iteration, need not * remain so for the duration of the iteration. * * Therefore, this does require a lockless RB-tree iteration to be non-fatal; * see the comment in lib/rbtree.c. Note however that we only require the first * condition -- not seeing partial stores -- because the latch thing isolates * us from loops. If we were to interrupt a modification the lookup would be * pointed at the stable tree and complete while the modification was halted. */ #ifndef RB_TREE_LATCH_H #define RB_TREE_LATCH_H #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/rcupdate.h> struct latch_tree_node { struct rb_node node[2]; }; struct latch_tree_root { seqcount_latch_t seq; struct rb_root tree[2]; }; /** * latch_tree_ops - operators to define the tree order * @less: used for insertion; provides the (partial) order between two elements. * @comp: used for lookups; provides the order between the search key and an element. * * The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * latch_tree_find(). */ struct latch_tree_ops { bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b); int (*comp)(void *key, struct latch_tree_node *b); }; static __always_inline struct latch_tree_node * __lt_from_rb(struct rb_node *node, int idx) { return container_of(node, struct latch_tree_node, node[idx]); } static __always_inline void __lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx, bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b)) { struct rb_root *root = &ltr->tree[idx]; struct rb_node **link = &root->rb_node; struct rb_node *node = &ltn->node[idx]; struct rb_node *parent = NULL; struct latch_tree_node *ltp; while (*link) { parent = *link; ltp = __lt_from_rb(parent, idx); if (less(ltn, ltp)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, root); } static __always_inline void __lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx) { rb_erase(&ltn->node[idx], &ltr->tree[idx]); } static __always_inline struct latch_tree_node * __lt_find(void *key, struct latch_tree_root *ltr, int idx, int (*comp)(void *key, struct latch_tree_node *node)) { struct rb_node *node = rcu_dereference_raw(ltr->tree[idx].rb_node); struct latch_tree_node *ltn; int c; while (node) { ltn = __lt_from_rb(node, idx); c = comp(key, ltn); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return ltn; } return NULL; } /** * latch_tree_insert() - insert @node into the trees @root * @node: nodes to insert * @root: trees to insert @node into * @ops: operators defining the node order * * It inserts @node into @root in an ordered fashion such that we can always * observe one complete tree. See the comment for write_seqcount_latch_begin(). * * The inserts use rcu_assign_pointer() to publish the element such that the * tree structure is stored before we can observe the new @node. * * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be * serialized. */ static __always_inline void latch_tree_insert(struct latch_tree_node *node, struct latch_tree_root *root, const struct latch_tree_ops *ops) { write_seqcount_latch_begin(&root->seq); __lt_insert(node, root, 0, ops->less); write_seqcount_latch(&root->seq); __lt_insert(node, root, 1, ops->less); write_seqcount_latch_end(&root->seq); } /** * latch_tree_erase() - removes @node from the trees @root * @node: nodes to remote * @root: trees to remove @node from * @ops: operators defining the node order * * Removes @node from the trees @root in an ordered fashion such that we can * always observe one complete tree. See the comment for * write_seqcount_latch_begin(). * * It is assumed that @node will observe one RCU quiescent state before being * reused of freed. * * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be * serialized. */ static __always_inline void latch_tree_erase(struct latch_tree_node *node, struct latch_tree_root *root, const struct latch_tree_ops *ops) { write_seqcount_latch_begin(&root->seq); __lt_erase(node, root, 0); write_seqcount_latch(&root->seq); __lt_erase(node, root, 1); write_seqcount_latch_end(&root->seq); } /** * latch_tree_find() - find the node matching @key in the trees @root * @key: search key * @root: trees to search for @key * @ops: operators defining the node order * * Does a lockless lookup in the trees @root for the node matching @key. * * It is assumed that this is called while holding the appropriate RCU read * side lock. * * If the operators define a partial order on the elements (there are multiple * elements which have the same key value) it is undefined which of these * elements will be found. Nor is it possible to iterate the tree to find * further elements with the same key value. * * Returns: a pointer to the node matching @key or NULL. */ static __always_inline struct latch_tree_node * latch_tree_find(void *key, struct latch_tree_root *root, const struct latch_tree_ops *ops) { struct latch_tree_node *node; unsigned int seq; do { seq = read_seqcount_latch(&root->seq); node = __lt_find(key, root, seq & 1, ops->comp); } while (read_seqcount_latch_retry(&root->seq, seq)); return node; } #endif /* RB_TREE_LATCH_H */
1259 1257 1258 1257 1259 452 916 1258 4278 4278 4276 4273 4292 2356 3097 4283 18133 18137 18122 18209 18136 5120 8621 1257 6729 8825 4475 6729 6729 6723 2850 4279 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2021, Google LLC. * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/page_table_check.h> #include <linux/swap.h> #include <linux/swapops.h> #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt struct page_table_check { atomic_t anon_map_count; atomic_t file_map_count; }; static bool __page_table_check_enabled __initdata = IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); EXPORT_SYMBOL(page_table_check_disabled); static int __init early_page_table_check_param(char *buf) { return kstrtobool(buf, &__page_table_check_enabled); } early_param("page_table_check", early_page_table_check_param); static bool __init need_page_table_check(void) { return __page_table_check_enabled; } static void __init init_page_table_check(void) { if (!__page_table_check_enabled) return; static_branch_disable(&page_table_check_disabled); } struct page_ext_operations page_table_check_ops = { .size = sizeof(struct page_table_check), .need = need_page_table_check, .init = init_page_table_check, .need_shared_flags = false, }; static struct page_table_check *get_page_table_check(struct page_ext *page_ext) { BUG_ON(!page_ext); return page_ext_data(page_ext, &page_table_check_ops); } /* * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. */ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) { struct page_ext_iter iter; struct page_ext *page_ext; struct page *page; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); BUG_ON(PageSlab(page)); anon = PageAnon(page); rcu_read_lock(); for_each_page_ext(page, pgcnt, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { BUG_ON(atomic_read(&ptc->file_map_count)); BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); } else { BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); } } rcu_read_unlock(); } /* * A new entry is added to the page table, increment the counters for that page * verify that it is of correct type and is not being mapped with a different * type to a different process. */ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, bool rw) { struct page_ext_iter iter; struct page_ext *page_ext; struct page *page; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); BUG_ON(PageSlab(page)); anon = PageAnon(page); rcu_read_lock(); for_each_page_ext(page, pgcnt, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { BUG_ON(atomic_read(&ptc->file_map_count)); BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); } else { BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); } } rcu_read_unlock(); } /* * page is on free list, or is being allocated, verify that counters are zeroes * crash if they are not. */ void __page_table_check_zero(struct page *page, unsigned int order) { struct page_ext_iter iter; struct page_ext *page_ext; BUG_ON(PageSlab(page)); rcu_read_lock(); for_each_page_ext(page, 1 << order, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_read(&ptc->file_map_count)); } rcu_read_unlock(); } void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (&init_mm == mm) return; if (pte_user_accessible_page(pte)) { page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pte_clear); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (&init_mm == mm) return; if (pmd_user_accessible_page(pmd)) { page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (&init_mm == mm) return; if (pud_user_accessible_page(pud)) { page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pud_clear); /* Whether the swap entry cached writable information */ static inline bool swap_cached_writable(swp_entry_t entry) { return is_writable_device_private_entry(entry) || is_writable_migration_entry(entry); } static inline void page_table_check_pte_flags(pte_t pte) { if (pte_present(pte) && pte_uffd_wp(pte)) WARN_ON_ONCE(pte_write(pte)); else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); } void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { unsigned int i; if (&init_mm == mm) return; page_table_check_pte_flags(pte); for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } EXPORT_SYMBOL(__page_table_check_ptes_set); static inline void page_table_check_pmd_flags(pmd_t pmd) { if (pmd_present(pmd) && pmd_uffd_wp(pmd)) WARN_ON_ONCE(pmd_write(pmd)); else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); } void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; page_table_check_pmd_flags(pmd); __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); } } EXPORT_SYMBOL(__page_table_check_pmd_set); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (&init_mm == mm) return; __page_table_check_pud_clear(mm, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); } } EXPORT_SYMBOL(__page_table_check_pud_set); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { if (&init_mm == mm) return; if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { pte_t *ptep = pte_offset_map(&pmd, addr); unsigned long i; if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { __page_table_check_pte_clear(mm, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } pte_unmap(ptep - PTRS_PER_PTE); } }
260 102 140 140 140 140 140 140 140 25 1059 913 147 915 914 915 1917 1921 17 260 474 214 259 260 261 259 213 214 213 144 132 132 132 12 2 2 12 12 120 119 2 1 1 129 129 129 128 129 141 111 3 111 201 201 17 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 // SPDX-License-Identifier: GPL-2.0-only /* net/core/xdp.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/bug.h> #include <net/page_pool/helpers.h> #include <net/hotdata.h> #include <net/netdev_lock.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> #include <net/xdp_sock_drv.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 #define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNUSED 0x3 static DEFINE_IDA(mem_id_pool); static DEFINE_MUTEX(mem_id_lock); #define MEM_ID_MAX 0xFFFE #define MEM_ID_MIN 1 static int mem_id_next = MEM_ID_MIN; static bool mem_id_init; /* false */ static struct rhashtable *mem_id_ht; static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) { const u32 *k = data; const u32 key = *k; BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id) != sizeof(u32)); /* Use cyclic increasing ID as direct hash key */ return key; } static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct xdp_mem_allocator *xa = ptr; u32 mem_id = *(u32 *)arg->key; return xa->mem.id != mem_id; } static const struct rhashtable_params mem_id_rht_params = { .nelem_hint = 64, .head_offset = offsetof(struct xdp_mem_allocator, node), .key_offset = offsetof(struct xdp_mem_allocator, mem.id), .key_len = sizeof_field(struct xdp_mem_allocator, mem.id), .max_size = MEM_ID_MAX, .min_size = 8, .automatic_shrinking = true, .hashfn = xdp_mem_id_hashfn, .obj_cmpfn = xdp_mem_id_cmp, }; static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) { struct xdp_mem_allocator *xa; xa = container_of(rcu, struct xdp_mem_allocator, rcu); /* Allow this ID to be reused */ ida_free(&mem_id_pool, xa->mem.id); kfree(xa); } static void mem_xa_remove(struct xdp_mem_allocator *xa) { trace_mem_disconnect(xa); if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); } static void mem_allocator_disconnect(void *allocator) { struct xdp_mem_allocator *xa; struct rhashtable_iter iter; mutex_lock(&mem_id_lock); rhashtable_walk_enter(mem_id_ht, &iter); do { rhashtable_walk_start(&iter); while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { if (xa->allocator == allocator) mem_xa_remove(xa); } rhashtable_walk_stop(&iter); } while (xa == ERR_PTR(-EAGAIN)); rhashtable_walk_exit(&iter); mutex_unlock(&mem_id_lock); } void xdp_unreg_mem_model(struct xdp_mem_info *mem) { struct xdp_mem_allocator *xa; int type = mem->type; int id = mem->id; /* Reset mem info to defaults */ mem->id = 0; mem->type = 0; if (id == 0) return; if (type == MEM_TYPE_PAGE_POOL) { xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); } } EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { WARN(1, "Missing register, driver bug"); return; } xdp_unreg_mem_model(&xdp_rxq->mem); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ if (xdp_rxq->reg_state == REG_STATE_UNUSED) return; xdp_rxq_info_unreg_mem_model(xdp_rxq); xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) { memset(xdp_rxq, 0, sizeof(*xdp_rxq)); } /* Returns 0 on success, negative on failure */ int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size) { if (!dev) { WARN(1, "Missing net_device from driver"); return -ENODEV; } if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); return -EINVAL; } if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { WARN(1, "Missing unregister, handled but fix driver"); xdp_rxq_info_unreg(xdp_rxq); } /* State either UNREGISTERED or NEW */ xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; xdp_rxq->frag_size = frag_size; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; } EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) { xdp_rxq->reg_state = REG_STATE_UNUSED; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) { return (xdp_rxq->reg_state == REG_STATE_REGISTERED); } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); static int __mem_id_init_hash_table(void) { struct rhashtable *rht; int ret; if (unlikely(mem_id_init)) return 0; rht = kzalloc(sizeof(*rht), GFP_KERNEL); if (!rht) return -ENOMEM; ret = rhashtable_init(rht, &mem_id_rht_params); if (ret < 0) { kfree(rht); return ret; } mem_id_ht = rht; smp_mb(); /* mutex lock should provide enough pairing */ mem_id_init = true; return 0; } /* Allocate a cyclic ID that maps to allocator pointer. * See: https://www.kernel.org/doc/html/latest/core-api/idr.html * * Caller must lock mem_id_lock. */ static int __mem_id_cyclic_get(gfp_t gfp) { int retries = 1; int id; again: id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp); if (id < 0) { if (id == -ENOSPC) { /* Cyclic allocator, reset next id */ if (retries--) { mem_id_next = MEM_ID_MIN; goto again; } } return id; /* errno */ } mem_id_next = id + 1; return id; } static bool __is_supported_mem_type(enum xdp_mem_type type) { if (type == MEM_TYPE_PAGE_POOL) return is_page_pool_compiled_in(); if (type >= MEM_TYPE_MAX) return false; return true; } static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator) { struct xdp_mem_allocator *xdp_alloc; gfp_t gfp = GFP_KERNEL; int id, errno, ret; void *ptr; if (!__is_supported_mem_type(type)) return ERR_PTR(-EOPNOTSUPP); mem->type = type; if (!allocator) { if (type == MEM_TYPE_PAGE_POOL) return ERR_PTR(-EINVAL); /* Setup time check page_pool req */ return NULL; } /* Delay init of rhashtable to save memory if feature isn't used */ if (!mem_id_init) { mutex_lock(&mem_id_lock); ret = __mem_id_init_hash_table(); mutex_unlock(&mem_id_lock); if (ret < 0) return ERR_PTR(ret); } xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); if (!xdp_alloc) return ERR_PTR(-ENOMEM); mutex_lock(&mem_id_lock); id = __mem_id_cyclic_get(gfp); if (id < 0) { errno = id; goto err; } mem->id = id; xdp_alloc->mem = *mem; xdp_alloc->allocator = allocator; /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { ida_free(&mem_id_pool, mem->id); mem->id = 0; errno = PTR_ERR(ptr); goto err; } if (type == MEM_TYPE_PAGE_POOL) page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem); mutex_unlock(&mem_id_lock); return xdp_alloc; err: mutex_unlock(&mem_id_lock); kfree(xdp_alloc); return ERR_PTR(errno); } int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator) { struct xdp_mem_allocator *xdp_alloc; xdp_alloc = __xdp_reg_mem_model(mem, type, allocator); if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); return 0; } EXPORT_SYMBOL_GPL(xdp_reg_mem_model); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator) { struct xdp_mem_allocator *xdp_alloc; if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { WARN(1, "Missing register, driver bug"); return -EFAULT; } xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator); if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) xsk_pool_set_rxq_info(allocator, xdp_rxq); if (trace_mem_connect_enabled() && xdp_alloc) trace_mem_connect(xdp_alloc, xdp_rxq); return 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /** * xdp_reg_page_pool - register &page_pool as a memory provider for XDP * @pool: &page_pool to register * * Can be used to register pools manually without connecting to any XDP RxQ * info, so that the XDP layer will be aware of them. Then, they can be * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool(). * * Return: %0 on success, -errno on error. */ int xdp_reg_page_pool(struct page_pool *pool) { struct xdp_mem_info mem; return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool); } EXPORT_SYMBOL_GPL(xdp_reg_page_pool); /** * xdp_unreg_page_pool - unregister &page_pool from the memory providers list * @pool: &page_pool to unregister * * A shorthand for manual unregistering page pools. If the pool was previously * attached to an RxQ info, it must be detached first. */ void xdp_unreg_page_pool(const struct page_pool *pool) { struct xdp_mem_info mem = { .type = MEM_TYPE_PAGE_POOL, .id = pool->xdp_mem_id, }; xdp_unreg_mem_model(&mem); } EXPORT_SYMBOL_GPL(xdp_unreg_page_pool); /** * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info * @xdp_rxq: XDP RxQ info to attach the pool to * @pool: pool to attach * * If the pool was registered manually, this function must be called instead * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info. */ void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, const struct page_pool *pool) { struct xdp_mem_info mem = { .type = MEM_TYPE_PAGE_POOL, .id = pool->xdp_mem_id, }; xdp_rxq_info_attach_mem_model(xdp_rxq, &mem); } EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp) { switch (mem_type) { case MEM_TYPE_PAGE_POOL: netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) * as mem->type knows this a page_pool page */ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi_direct); break; case MEM_TYPE_PAGE_SHARED: page_frag_free(__netmem_address(netmem)); break; case MEM_TYPE_PAGE_ORDER0: put_page(__netmem_to_page(netmem)); break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ xsk_buff_free(xdp); break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ WARN(1, "Incorrect XDP memory type (%d) usage", mem_type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, false, NULL); out: __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, true, NULL); out: __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); /* XDP bulk APIs introduce a defer/flush mechanism to return * pages belonging to the same xdp_mem_allocator object * (identified via the mem.id field) in bulk to optimize * I-cache and D-cache. * The bulk queue size is set to 16 to be aligned to how * XDP_REDIRECT bulking works. The bulk is flushed when * it is full or when mem.id changes. * xdp_frame_bulk is usually stored/allocated on the function * call-stack to avoid locking penalties. */ /* Must be called with rcu_read_lock held */ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq) { if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) { xdp_return_frame(xdpf); return; } if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); if (unlikely(xdp_frame_has_frags(xdpf))) { struct skb_shared_info *sinfo; int i; sinfo = xdp_get_shared_info_from_frame(xdpf); for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; bq->q[bq->count++] = skb_frag_netmem(frag); if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); } } bq->q[bq->count++] = virt_to_netmem(xdpf->data); } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); /** * xdp_return_frag -- free one XDP frag or decrement its refcount * @netmem: network memory reference to release * @xdp: &xdp_buff to release the frag for */ void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp) { __xdp_return(netmem, xdp->rxq->mem.type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frag); void xdp_return_buff(struct xdp_buff *xdp) { struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); for (u32 i = 0; i < sinfo->nr_frags; i++) __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdp->rxq->mem.type, true, xdp); out: __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { if (info->prog) bpf_prog_put(info->prog); info->prog = bpf->prog; info->flags = bpf->flags; } EXPORT_SYMBOL_GPL(xdp_attachment_setup); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) { unsigned int metasize, totsize; void *addr, *data_to_copy; struct xdp_frame *xdpf; struct page *page; /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */ metasize = xdp_data_meta_unsupported(xdp) ? 0 : xdp->data - xdp->data_meta; totsize = xdp->data_end - xdp->data + metasize; if (sizeof(*xdpf) + totsize > PAGE_SIZE) return NULL; page = dev_alloc_page(); if (!page) return NULL; addr = page_to_virt(page); xdpf = addr; memset(xdpf, 0, sizeof(*xdpf)); addr += sizeof(*xdpf); data_to_copy = metasize ? xdp->data_meta : xdp->data; memcpy(addr, data_to_copy, totsize); xdpf->data = addr + metasize; xdpf->len = totsize - metasize; xdpf->headroom = 0; xdpf->metasize = metasize; xdpf->frame_sz = PAGE_SIZE; xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); /* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ void xdp_warn(const char *msg, const char *func, const int line) { WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); }; EXPORT_SYMBOL_GPL(xdp_warn); /** * xdp_build_skb_from_buff - create an skb from &xdp_buff * @xdp: &xdp_buff to convert to an skb * * Perform common operations to create a new skb to pass up the stack from * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize * skb data pointers and offsets, set the recycle bit if the buff is * PP-backed, Rx queue index, protocol and update frags info. * * Return: new &sk_buff on success, %NULL on error. */ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) { const struct xdp_rxq_info *rxq = xdp->rxq; const struct skb_shared_info *sinfo; struct sk_buff *skb; u32 nr_frags = 0; int metalen; if (unlikely(xdp_buff_has_frags(xdp))) { sinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = sinfo->nr_frags; } skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); if (unlikely(!skb)) return NULL; skb_reserve(skb, xdp->data - xdp->data_hard_start); __skb_put(skb, xdp->data_end - xdp->data); metalen = xdp->data - xdp->data_meta; if (metalen > 0) skb_metadata_set(skb, metalen); if (rxq->mem.type == MEM_TYPE_PAGE_POOL) skb_mark_for_recycle(skb); skb_record_rx_queue(skb, rxq->queue_index); if (unlikely(nr_frags)) { u32 tsize; tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; xdp_update_skb_shared_info(skb, nr_frags, sinfo->xdp_frags_size, tsize, xdp_buff_is_frag_pfmemalloc(xdp)); } skb->protocol = eth_type_trans(skb, rxq->dev); return skb; } EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff); /** * xdp_copy_frags_from_zc - copy frags from XSk buff to skb * @skb: skb to copy frags to * @xdp: XSk &xdp_buff from which the frags will be copied * @pp: &page_pool backing page allocation, if available * * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack. * Allocate a new buffer for each frag, copy it and attach to the skb. * * Return: true on success, false on netmem allocation fail. */ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, const struct xdp_buff *xdp, struct page_pool *pp) { struct skb_shared_info *sinfo = skb_shinfo(skb); const struct skb_shared_info *xinfo; u32 nr_frags, tsize = 0; bool pfmemalloc = false; xinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = xinfo->nr_frags; for (u32 i = 0; i < nr_frags; i++) { u32 len = skb_frag_size(&xinfo->frags[i]); u32 offset, truesize = len; netmem_ref netmem; netmem = page_pool_dev_alloc_netmem(pp, &offset, &truesize); if (unlikely(!netmem)) { sinfo->nr_frags = i; return false; } memcpy(__netmem_address(netmem), __netmem_address(xinfo->frags[i].netmem), LARGEST_ALIGN(len)); __skb_fill_netmem_desc_noacc(sinfo, i, netmem, offset, len); tsize += truesize; pfmemalloc |= netmem_is_pfmemalloc(netmem); } xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, pfmemalloc); return true; } /** * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff * @xdp: source XSk buff * * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb * head, new buffer for the head, copy the data and initialize the skb fields. * If there are frags, allocate new buffers for them and copy. * Buffers are allocated from the system percpu pools to try recycling them. * If new skb was built successfully, @xdp is returned to XSk pool's freelist. * On error, it remains untouched and the caller must take care of this. * * Return: new &sk_buff on success, %NULL on error. */ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) { struct page_pool *pp = this_cpu_read(system_page_pool); const struct xdp_rxq_info *rxq = xdp->rxq; u32 len = xdp->data_end - xdp->data_meta; u32 truesize = xdp->frame_sz; struct sk_buff *skb; int metalen; void *data; if (!IS_ENABLED(CONFIG_PAGE_POOL)) return NULL; data = page_pool_dev_alloc_va(pp, &truesize); if (unlikely(!data)) return NULL; skb = napi_build_skb(data, truesize); if (unlikely(!skb)) { page_pool_free_va(pp, data, true); return NULL; } skb_mark_for_recycle(skb); skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len)); metalen = xdp->data - xdp->data_meta; if (metalen > 0) { skb_metadata_set(skb, metalen); __skb_pull(skb, metalen); } skb_record_rx_queue(skb, rxq->queue_index); if (unlikely(xdp_buff_has_frags(xdp)) && unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { napi_consume_skb(skb, true); return NULL; } xsk_buff_free(xdp); skb->protocol = eth_type_trans(skb, rxq->dev); return skb; } EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); unsigned int headroom, frame_size; void *hard_start; u8 nr_frags; /* xdp frags frame */ if (unlikely(xdp_frame_has_frags(xdpf))) nr_frags = sinfo->nr_frags; /* Part of headroom was reserved to xdpf */ headroom = sizeof(*xdpf) + xdpf->headroom; /* Memory size backing xdp_frame data already have reserved * room for build_skb to place skb_shared_info in tailroom. */ frame_size = xdpf->frame_sz; hard_start = xdpf->data - headroom; skb = build_skb_around(skb, hard_start, frame_size); if (unlikely(!skb)) return NULL; skb_reserve(skb, headroom); __skb_put(skb, xdpf->len); if (xdpf->metasize) skb_metadata_set(skb, xdpf->metasize); if (unlikely(xdp_frame_has_frags(xdpf))) xdp_update_skb_shared_info(skb, nr_frags, sinfo->xdp_frags_size, nr_frags * xdpf->frame_sz, xdp_frame_is_frag_pfmemalloc(xdpf)); /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) * - HW RX hash (skb_set_hash) * - RX ring dev queue index (skb_record_rx_queue) */ if (xdpf->mem_type == MEM_TYPE_PAGE_POOL) skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); return skb; } EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev) { struct sk_buff *skb; skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); return __xdp_build_skb_from_frame(xdpf, skb, dev); } EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) { unsigned int headroom, totalsize; struct xdp_frame *nxdpf; struct page *page; void *addr; headroom = xdpf->headroom + sizeof(*xdpf); totalsize = headroom + xdpf->len; if (unlikely(totalsize > PAGE_SIZE)) return NULL; page = dev_alloc_page(); if (!page) return NULL; addr = page_to_virt(page); memcpy(addr, xdpf, totalsize); nxdpf = addr; nxdpf->data = addr + headroom; nxdpf->frame_sz = PAGE_SIZE; nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0; return nxdpf; } __bpf_kfunc_start_defs(); /** * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. * @ctx: XDP context pointer. * @timestamp: Return value pointer. * * Return: * * Returns 0 on success or ``-errno`` on error. * * ``-EOPNOTSUPP`` : means device driver does not implement kfunc * * ``-ENODATA`` : means no RX-timestamp available for this frame */ __bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) { return -EOPNOTSUPP; } /** * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. * @ctx: XDP context pointer. * @hash: Return value pointer. * @rss_type: Return value pointer for RSS type. * * The RSS hash type (@rss_type) specifies what portion of packet headers NIC * hardware used when calculating RSS hash value. The RSS type can be decoded * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types* * ``XDP_RSS_TYPE_L*``. * * Return: * * Returns 0 on success or ``-errno`` on error. * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc * * ``-ENODATA`` : means no RX-hash available for this frame */ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { return -EOPNOTSUPP; } /** * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag * @ctx: XDP context pointer. * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID). * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP) * * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*, * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)** * and should be used as follows: * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();`` * * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag. * Driver is expected to provide those in **host byte order (usually LE)**, * so the bpf program should not perform byte conversion. * According to 802.1Q standard, *VLAN TCI (Tag control information)* * is a bit field that contains: * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``, * *Drop eligible indicator (DEI)* - 1 bit, * *Priority code point (PCP)* - 3 bits. * For detailed meaning of DEI and PCP, please refer to other sources. * * Return: * * Returns 0 on success or ``-errno`` on error. * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc * * ``-ENODATA`` : VLAN tag was not stripped or is not available */ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci) { return -EOPNOTSUPP; } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_KFUNCS_END(xdp_metadata_kfunc_ids) static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { .owner = THIS_MODULE, .set = &xdp_metadata_kfunc_ids, }; BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) #define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC u32 bpf_xdp_metadata_kfunc_id(int id) { /* xdp_metadata_kfunc_ids is sorted and can't be used */ return xdp_metadata_kfunc_ids_unsorted[id]; } bool bpf_dev_bound_kfunc_id(u32 btf_id) { return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id); } static int __init xdp_metadata_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); } late_initcall(xdp_metadata_init); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) { val &= NETDEV_XDP_ACT_MASK; if (dev->xdp_features == val) return; netdev_assert_locked_or_invisible(dev); dev->xdp_features = val; if (dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { netdev_lock(dev); xdp_set_features_flag_locked(dev, val); netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_set_features_flag); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); if (support_sg) val |= NETDEV_XDP_ACT_NDO_XMIT_SG; xdp_set_features_flag(dev, val); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); void xdp_features_clear_redirect_target(struct net_device *dev) { xdp_features_t val = dev->xdp_features; val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); xdp_set_features_flag(dev, val); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 // SPDX-License-Identifier: GPL-2.0-only /* * lec.c: Lan Emulation driver * * Marko Kiiskila <mkiiskila@yahoo.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/capability.h> /* We are ethernet device */ #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <net/sock.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <asm/byteorder.h> #include <linux/uaccess.h> #include <net/arp.h> #include <net/dst.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/seq_file.h> /* And atm device */ #include <linux/atmdev.h> #include <linux/atmlec.h> /* Proxy LEC knows about bridging */ #if IS_ENABLED(CONFIG_BRIDGE) #include "../bridge/br_private.h" static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 }; #endif /* Modular too */ #include <linux/module.h> #include <linux/init.h> /* Hardening for Spectre-v1 */ #include <linux/nospec.h> #include "lec.h" #include "lec_arpc.h" #include "resources.h" #define DUMP_PACKETS 0 /* * 0 = None, * 1 = 30 first bytes * 2 = Whole packet */ #define LEC_UNRES_QUE_LEN 8 /* * number of tx packets to queue for a * single destination while waiting for SVC */ static int lec_open(struct net_device *dev); static netdev_tx_t lec_start_xmit(struct sk_buff *skb, struct net_device *dev); static int lec_close(struct net_device *dev); static struct lec_arp_table *lec_arp_find(struct lec_priv *priv, const unsigned char *mac_addr); static int lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove); /* LANE2 functions */ static void lane2_associate_ind(struct net_device *dev, const u8 *mac_address, const u8 *tlvs, u32 sizeoftlvs); static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force, u8 **tlvs, u32 *sizeoftlvs); static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst, const u8 *tlvs, u32 sizeoftlvs); static int lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long permanent); static void lec_arp_check_empties(struct lec_priv *priv, struct atm_vcc *vcc, struct sk_buff *skb); static void lec_arp_destroy(struct lec_priv *priv); static void lec_arp_init(struct lec_priv *priv); static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, const unsigned char *mac_to_find, int is_rdesc, struct lec_arp_table **ret_entry); static void lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, const unsigned char *atm_addr, unsigned long remoteflag, unsigned int targetless_le_arp); static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id); static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc); static void lec_set_flush_tran_id(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long tran_id); static void lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, struct atm_vcc *vcc, void (*old_push)(struct atm_vcc *vcc, struct sk_buff *skb)); static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc); /* must be done under lec_arp_lock */ static inline void lec_arp_hold(struct lec_arp_table *entry) { refcount_inc(&entry->usage); } static inline void lec_arp_put(struct lec_arp_table *entry) { if (refcount_dec_and_test(&entry->usage)) kfree(entry); } static struct lane2_ops lane2_ops = { .resolve = lane2_resolve, /* spec 3.1.3 */ .associate_req = lane2_associate_req, /* spec 3.1.4 */ .associate_indicator = NULL /* spec 3.1.5 */ }; static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; /* Device structures */ static struct net_device *dev_lec[MAX_LEC_ITF]; #if IS_ENABLED(CONFIG_BRIDGE) static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) { char *buff; struct lec_priv *priv; /* * Check if this is a BPDU. If so, ask zeppelin to send * LE_TOPOLOGY_REQUEST with the same value of Topology Change bit * as the Config BPDU has */ buff = skb->data + skb->dev->hard_header_len; if (*buff++ == 0x42 && *buff++ == 0x42 && *buff++ == 0x03) { struct sock *sk; struct sk_buff *skb2; struct atmlec_msg *mesg; skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); if (skb2 == NULL) return; skb2->len = sizeof(struct atmlec_msg); mesg = (struct atmlec_msg *)skb2->data; mesg->type = l_topology_change; buff += 4; mesg->content.normal.flag = *buff & 0x01; /* 0x01 is topology change */ priv = netdev_priv(dev); atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); sk->sk_data_ready(sk); } } #endif /* IS_ENABLED(CONFIG_BRIDGE) */ /* * Open/initialize the netdevice. This is called (in the current kernel) * sometime after booting when the 'ifconfig' program is run. * * This routine should set everything up anew at each open, even * registers that "should" only need to be set once at boot, so that * there is non-reboot way to recover if something goes wrong. */ static int lec_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static void lec_send(struct atm_vcc *vcc, struct sk_buff *skb) { struct net_device *dev = skb->dev; unsigned int len = skb->len; ATM_SKB(skb)->vcc = vcc; atm_account_tx(vcc, skb); if (vcc->send(vcc, skb) < 0) { dev->stats.tx_dropped++; return; } dev->stats.tx_packets++; dev->stats.tx_bytes += len; } static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue) { pr_info("%s\n", dev->name); netif_trans_update(dev); netif_wake_queue(dev); } static netdev_tx_t lec_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *skb2; struct lec_priv *priv = netdev_priv(dev); struct lecdatahdr_8023 *lec_h; struct atm_vcc *vcc; struct lec_arp_table *entry; unsigned char *dst; int min_frame_size; int is_rdesc; pr_debug("called\n"); if (!priv->lecd) { pr_info("%s:No lecd attached\n", dev->name); dev->stats.tx_errors++; netif_stop_queue(dev); kfree_skb(skb); return NETDEV_TX_OK; } pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n", (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb), (long)skb_end_pointer(skb)); #if IS_ENABLED(CONFIG_BRIDGE) if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0) lec_handle_bridge(skb, dev); #endif /* Make sure we have room for lec_id */ if (skb_headroom(skb) < 2) { pr_debug("reallocating skb\n"); skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN); if (unlikely(!skb2)) { kfree_skb(skb); return NETDEV_TX_OK; } consume_skb(skb); skb = skb2; } skb_push(skb, 2); /* Put le header to place */ lec_h = (struct lecdatahdr_8023 *)skb->data; lec_h->le_header = htons(priv->lecid); #if DUMP_PACKETS >= 2 #define MAX_DUMP_SKB 99 #elif DUMP_PACKETS >= 1 #define MAX_DUMP_SKB 30 #endif #if DUMP_PACKETS >= 1 printk(KERN_DEBUG "%s: send datalen:%ld lecid:%4.4x\n", dev->name, skb->len, priv->lecid); print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1, skb->data, min(skb->len, MAX_DUMP_SKB), true); #endif /* DUMP_PACKETS >= 1 */ /* Minimum ethernet-frame size */ min_frame_size = LEC_MINIMUM_8023_SIZE; if (skb->len < min_frame_size) { if ((skb->len + skb_tailroom(skb)) < min_frame_size) { skb2 = skb_copy_expand(skb, 0, min_frame_size - skb->truesize, GFP_ATOMIC); dev_kfree_skb(skb); if (skb2 == NULL) { dev->stats.tx_dropped++; return NETDEV_TX_OK; } skb = skb2; } skb_put(skb, min_frame_size - skb->len); } /* Send to right vcc */ is_rdesc = 0; dst = lec_h->h_dest; entry = NULL; vcc = lec_arp_resolve(priv, dst, is_rdesc, &entry); pr_debug("%s:vcc:%p vcc_flags:%lx, entry:%p\n", dev->name, vcc, vcc ? vcc->flags : 0, entry); if (!vcc || !test_bit(ATM_VF_READY, &vcc->flags)) { if (entry && (entry->tx_wait.qlen < LEC_UNRES_QUE_LEN)) { pr_debug("%s:queuing packet, MAC address %pM\n", dev->name, lec_h->h_dest); skb_queue_tail(&entry->tx_wait, skb); } else { pr_debug("%s:tx queue full or no arp entry, dropping, MAC address: %pM\n", dev->name, lec_h->h_dest); dev->stats.tx_dropped++; dev_kfree_skb(skb); } goto out; } #if DUMP_PACKETS > 0 printk(KERN_DEBUG "%s:sending to vpi:%d vci:%d\n", dev->name, vcc->vpi, vcc->vci); #endif /* DUMP_PACKETS > 0 */ while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) { pr_debug("emptying tx queue, MAC address %pM\n", lec_h->h_dest); lec_send(vcc, skb2); } lec_send(vcc, skb); if (!atm_may_send(vcc, 0)) { struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); vpriv->xoff = 1; netif_stop_queue(dev); /* * vcc->pop() might have occurred in between, making * the vcc usuable again. Since xmit is serialized, * this is the only situation we have to re-test. */ if (atm_may_send(vcc, 0)) netif_wake_queue(dev); } out: if (entry) lec_arp_put(entry); netif_trans_update(dev); return NETDEV_TX_OK; } /* The inverse routine to net_open(). */ static int lec_close(struct net_device *dev) { netif_stop_queue(dev); return 0; } static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) { static const u8 zero_addr[ETH_ALEN] = {}; unsigned long flags; struct net_device *dev = (struct net_device *)vcc->proto_data; struct lec_priv *priv = netdev_priv(dev); struct atmlec_msg *mesg; struct lec_arp_table *entry; char *tmp; /* FIXME */ WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc)); mesg = (struct atmlec_msg *)skb->data; tmp = skb->data; tmp += sizeof(struct atmlec_msg); pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type); switch (mesg->type) { case l_set_mac_addr: eth_hw_addr_set(dev, mesg->content.normal.mac_addr); break; case l_del_mac_addr: eth_hw_addr_set(dev, zero_addr); break; case l_addr_delete: lec_addr_delete(priv, mesg->content.normal.atm_addr, mesg->content.normal.flag); break; case l_topology_change: priv->topology_change = mesg->content.normal.flag; break; case l_flush_complete: lec_flush_complete(priv, mesg->content.normal.flag); break; case l_narp_req: /* LANE2: see 7.1.35 in the lane2 spec */ spin_lock_irqsave(&priv->lec_arp_lock, flags); entry = lec_arp_find(priv, mesg->content.normal.mac_addr); lec_arp_remove(priv, entry); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); if (mesg->content.normal.no_source_le_narp) break; fallthrough; case l_arp_update: lec_arp_update(priv, mesg->content.normal.mac_addr, mesg->content.normal.atm_addr, mesg->content.normal.flag, mesg->content.normal.targetless_le_arp); pr_debug("in l_arp_update\n"); if (mesg->sizeoftlvs != 0) { /* LANE2 3.1.5 */ pr_debug("LANE2 3.1.5, got tlvs, size %d\n", mesg->sizeoftlvs); lane2_associate_ind(dev, mesg->content.normal.mac_addr, tmp, mesg->sizeoftlvs); } break; case l_config: priv->maximum_unknown_frame_count = mesg->content.config.maximum_unknown_frame_count; priv->max_unknown_frame_time = (mesg->content.config.max_unknown_frame_time * HZ); priv->max_retry_count = mesg->content.config.max_retry_count; priv->aging_time = (mesg->content.config.aging_time * HZ); priv->forward_delay_time = (mesg->content.config.forward_delay_time * HZ); priv->arp_response_time = (mesg->content.config.arp_response_time * HZ); priv->flush_timeout = (mesg->content.config.flush_timeout * HZ); priv->path_switching_delay = (mesg->content.config.path_switching_delay * HZ); priv->lane_version = mesg->content.config.lane_version; /* LANE2 */ priv->lane2_ops = NULL; if (priv->lane_version > 1) priv->lane2_ops = &lane2_ops; rtnl_lock(); if (dev_set_mtu(dev, mesg->content.config.mtu)) pr_info("%s: change_mtu to %d failed\n", dev->name, mesg->content.config.mtu); rtnl_unlock(); priv->is_proxy = mesg->content.config.is_proxy; break; case l_flush_tran_id: lec_set_flush_tran_id(priv, mesg->content.normal.atm_addr, mesg->content.normal.flag); break; case l_set_lecid: priv->lecid = (unsigned short)(0xffff & mesg->content.normal.flag); break; case l_should_bridge: #if IS_ENABLED(CONFIG_BRIDGE) { pr_debug("%s: bridge zeppelin asks about %pM\n", dev->name, mesg->content.proxy.mac_addr); if (br_fdb_test_addr_hook == NULL) break; if (br_fdb_test_addr_hook(dev, mesg->content.proxy.mac_addr)) { /* hit from bridge table, send LE_ARP_RESPONSE */ struct sk_buff *skb2; struct sock *sk; pr_debug("%s: entry found, responding to zeppelin\n", dev->name); skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); if (skb2 == NULL) break; skb2->len = sizeof(struct atmlec_msg); skb_copy_to_linear_data(skb2, mesg, sizeof(*mesg)); atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); sk->sk_data_ready(sk); } } #endif /* IS_ENABLED(CONFIG_BRIDGE) */ break; default: pr_info("%s: Unknown message type %d\n", dev->name, mesg->type); dev_kfree_skb(skb); return -EINVAL; } dev_kfree_skb(skb); return 0; } static void lec_atm_close(struct atm_vcc *vcc) { struct sk_buff *skb; struct net_device *dev = (struct net_device *)vcc->proto_data; struct lec_priv *priv = netdev_priv(dev); priv->lecd = NULL; /* Do something needful? */ netif_stop_queue(dev); lec_arp_destroy(priv); if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) pr_info("%s closing with messages pending\n", dev->name); while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) { atm_return(vcc, skb->truesize); dev_kfree_skb(skb); } pr_info("%s: Shut down!\n", dev->name); module_put(THIS_MODULE); } static const struct atmdev_ops lecdev_ops = { .close = lec_atm_close, .send = lec_atm_send }; static struct atm_dev lecatm_dev = { .ops = &lecdev_ops, .type = "lec", .number = 999, /* dummy device number */ .lock = __SPIN_LOCK_UNLOCKED(lecatm_dev.lock) }; /* * LANE2: new argument struct sk_buff *data contains * the LE_ARP based TLVs introduced in the LANE2 spec */ static int send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, const unsigned char *mac_addr, const unsigned char *atm_addr, struct sk_buff *data) { struct sock *sk; struct sk_buff *skb; struct atmlec_msg *mesg; if (!priv || !priv->lecd) return -1; skb = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); if (!skb) return -1; skb->len = sizeof(struct atmlec_msg); mesg = (struct atmlec_msg *)skb->data; memset(mesg, 0, sizeof(struct atmlec_msg)); mesg->type = type; if (data != NULL) mesg->sizeoftlvs = data->len; if (mac_addr) ether_addr_copy(mesg->content.normal.mac_addr, mac_addr); else mesg->content.normal.targetless_le_arp = 1; if (atm_addr) memcpy(&mesg->content.normal.atm_addr, atm_addr, ATM_ESA_LEN); atm_force_charge(priv->lecd, skb->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); if (data != NULL) { pr_debug("about to send %d bytes of data\n", data->len); atm_force_charge(priv->lecd, data->truesize); skb_queue_tail(&sk->sk_receive_queue, data); sk->sk_data_ready(sk); } return 0; } static void lec_set_multicast_list(struct net_device *dev) { /* * by default, all multicast frames arrive over the bus. * eventually support selective multicast service */ } static const struct net_device_ops lec_netdev_ops = { .ndo_open = lec_open, .ndo_stop = lec_close, .ndo_start_xmit = lec_start_xmit, .ndo_tx_timeout = lec_tx_timeout, .ndo_set_rx_mode = lec_set_multicast_list, }; static const unsigned char lec_ctrl_magic[] = { 0xff, 0x00, 0x01, 0x01 }; #define LEC_DATA_DIRECT_8023 2 #define LEC_DATA_DIRECT_8025 3 static int lec_is_data_direct(struct atm_vcc *vcc) { return ((vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8023) || (vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8025)); } static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb) { unsigned long flags; struct net_device *dev = (struct net_device *)vcc->proto_data; struct lec_priv *priv = netdev_priv(dev); #if DUMP_PACKETS > 0 printk(KERN_DEBUG "%s: vcc vpi:%d vci:%d\n", dev->name, vcc->vpi, vcc->vci); #endif if (!skb) { pr_debug("%s: null skb\n", dev->name); lec_vcc_close(priv, vcc); return; } #if DUMP_PACKETS >= 2 #define MAX_SKB_DUMP 99 #elif DUMP_PACKETS >= 1 #define MAX_SKB_DUMP 30 #endif #if DUMP_PACKETS > 0 printk(KERN_DEBUG "%s: rcv datalen:%ld lecid:%4.4x\n", dev->name, skb->len, priv->lecid); print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1, skb->data, min(MAX_SKB_DUMP, skb->len), true); #endif /* DUMP_PACKETS > 0 */ if (memcmp(skb->data, lec_ctrl_magic, 4) == 0) { /* Control frame, to daemon */ struct sock *sk = sk_atm(vcc); pr_debug("%s: To daemon\n", dev->name); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); } else { /* Data frame, queue to protocol handlers */ struct lec_arp_table *entry; unsigned char *src, *dst; atm_return(vcc, skb->truesize); if (*(__be16 *) skb->data == htons(priv->lecid) || !priv->lecd || !(dev->flags & IFF_UP)) { /* * Probably looping back, or if lecd is missing, * lecd has gone down */ pr_debug("Ignoring frame...\n"); dev_kfree_skb(skb); return; } dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest; /* * If this is a Data Direct VCC, and the VCC does not match * the LE_ARP cache entry, delete the LE_ARP cache entry. */ spin_lock_irqsave(&priv->lec_arp_lock, flags); if (lec_is_data_direct(vcc)) { src = ((struct lecdatahdr_8023 *)skb->data)->h_source; entry = lec_arp_find(priv, src); if (entry && entry->vcc != vcc) { lec_arp_remove(priv, entry); lec_arp_put(entry); } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); if (!(dst[0] & 0x01) && /* Never filter Multi/Broadcast */ !priv->is_proxy && /* Proxy wants all the packets */ memcmp(dst, dev->dev_addr, dev->addr_len)) { dev_kfree_skb(skb); return; } if (!hlist_empty(&priv->lec_arp_empty_ones)) lec_arp_check_empties(priv, vcc, skb); skb_pull(skb, 2); /* skip lec_id */ skb->protocol = eth_type_trans(skb, dev); dev->stats.rx_packets++; dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); } } static void lec_pop(struct atm_vcc *vcc, struct sk_buff *skb) { struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); struct net_device *dev = skb->dev; if (vpriv == NULL) { pr_info("vpriv = NULL!?!?!?\n"); return; } vpriv->old_pop(vcc, skb); if (vpriv->xoff && atm_may_send(vcc, 0)) { vpriv->xoff = 0; if (netif_running(dev) && netif_queue_stopped(dev)) netif_wake_queue(dev); } } static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) { struct lec_vcc_priv *vpriv; int bytes_left; struct atmlec_ioc ioc_data; /* Lecd must be up in this case */ bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc)); if (bytes_left != 0) pr_info("copy from user failed for %d bytes\n", bytes_left); if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF) return -EINVAL; ioc_data.dev_num = array_index_nospec(ioc_data.dev_num, MAX_LEC_ITF); if (!dev_lec[ioc_data.dev_num]) return -EINVAL; vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL); if (!vpriv) return -ENOMEM; vpriv->xoff = 0; vpriv->old_pop = vcc->pop; vcc->user_back = vpriv; vcc->pop = lec_pop; lec_vcc_added(netdev_priv(dev_lec[ioc_data.dev_num]), &ioc_data, vcc, vcc->push); vcc->proto_data = dev_lec[ioc_data.dev_num]; vcc->push = lec_push; return 0; } static int lec_mcast_attach(struct atm_vcc *vcc, int arg) { if (arg < 0 || arg >= MAX_LEC_ITF) return -EINVAL; arg = array_index_nospec(arg, MAX_LEC_ITF); if (!dev_lec[arg]) return -EINVAL; vcc->proto_data = dev_lec[arg]; return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc); } /* Initialize device. */ static int lecd_attach(struct atm_vcc *vcc, int arg) { int i; struct lec_priv *priv; if (arg < 0) arg = 0; if (arg >= MAX_LEC_ITF) return -EINVAL; i = array_index_nospec(arg, MAX_LEC_ITF); if (!dev_lec[i]) { int size; size = sizeof(struct lec_priv); dev_lec[i] = alloc_etherdev(size); if (!dev_lec[i]) return -ENOMEM; dev_lec[i]->netdev_ops = &lec_netdev_ops; dev_lec[i]->max_mtu = 18190; snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i); if (register_netdev(dev_lec[i])) { free_netdev(dev_lec[i]); return -EINVAL; } priv = netdev_priv(dev_lec[i]); } else { priv = netdev_priv(dev_lec[i]); if (priv->lecd) return -EADDRINUSE; } lec_arp_init(priv); priv->itfnum = i; /* LANE2 addition */ priv->lecd = vcc; vcc->dev = &lecatm_dev; vcc_insert_socket(sk_atm(vcc)); vcc->proto_data = dev_lec[i]; set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); /* Set default values to these variables */ priv->maximum_unknown_frame_count = 1; priv->max_unknown_frame_time = (1 * HZ); priv->vcc_timeout_period = (1200 * HZ); priv->max_retry_count = 1; priv->aging_time = (300 * HZ); priv->forward_delay_time = (15 * HZ); priv->topology_change = 0; priv->arp_response_time = (1 * HZ); priv->flush_timeout = (4 * HZ); priv->path_switching_delay = (6 * HZ); if (dev_lec[i]->flags & IFF_UP) netif_start_queue(dev_lec[i]); __module_get(THIS_MODULE); return i; } #ifdef CONFIG_PROC_FS static const char *lec_arp_get_status_string(unsigned char status) { static const char *const lec_arp_status_string[] = { "ESI_UNKNOWN ", "ESI_ARP_PENDING ", "ESI_VC_PENDING ", "<Undefined> ", "ESI_FLUSH_PENDING ", "ESI_FORWARD_DIRECT" }; if (status > ESI_FORWARD_DIRECT) status = 3; /* ESI_UNDEFINED */ return lec_arp_status_string[status]; } static void lec_info(struct seq_file *seq, struct lec_arp_table *entry) { seq_printf(seq, "%pM ", entry->mac_addr); seq_printf(seq, "%*phN ", ATM_ESA_LEN, entry->atm_addr); seq_printf(seq, "%s %4.4x", lec_arp_get_status_string(entry->status), entry->flags & 0xffff); if (entry->vcc) seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci); else seq_printf(seq, " "); if (entry->recv_vcc) { seq_printf(seq, " %3d %3d", entry->recv_vcc->vpi, entry->recv_vcc->vci); } seq_putc(seq, '\n'); } struct lec_state { unsigned long flags; struct lec_priv *locked; struct hlist_node *node; struct net_device *dev; int itf; int arp_table; int misc_table; }; static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl, loff_t *l) { struct hlist_node *e = state->node; if (!e) e = tbl->first; if (e == SEQ_START_TOKEN) { e = tbl->first; --*l; } for (; e; e = e->next) { if (--*l < 0) break; } state->node = e; return (*l < 0) ? state : NULL; } static void *lec_arp_walk(struct lec_state *state, loff_t *l, struct lec_priv *priv) { void *v = NULL; int p; for (p = state->arp_table; p < LEC_ARP_TABLE_SIZE; p++) { v = lec_tbl_walk(state, &priv->lec_arp_tables[p], l); if (v) break; } state->arp_table = p; return v; } static void *lec_misc_walk(struct lec_state *state, loff_t *l, struct lec_priv *priv) { struct hlist_head *lec_misc_tables[] = { &priv->lec_arp_empty_ones, &priv->lec_no_forward, &priv->mcast_fwds }; void *v = NULL; int q; for (q = state->misc_table; q < ARRAY_SIZE(lec_misc_tables); q++) { v = lec_tbl_walk(state, lec_misc_tables[q], l); if (v) break; } state->misc_table = q; return v; } static void *lec_priv_walk(struct lec_state *state, loff_t *l, struct lec_priv *priv) { if (!state->locked) { state->locked = priv; spin_lock_irqsave(&priv->lec_arp_lock, state->flags); } if (!lec_arp_walk(state, l, priv) && !lec_misc_walk(state, l, priv)) { spin_unlock_irqrestore(&priv->lec_arp_lock, state->flags); state->locked = NULL; /* Partial state reset for the next time we get called */ state->arp_table = state->misc_table = 0; } return state->locked; } static void *lec_itf_walk(struct lec_state *state, loff_t *l) { struct net_device *dev; void *v; dev = state->dev ? state->dev : dev_lec[state->itf]; v = (dev && netdev_priv(dev)) ? lec_priv_walk(state, l, netdev_priv(dev)) : NULL; if (!v && dev) { dev_put(dev); /* Partial state reset for the next time we get called */ dev = NULL; } state->dev = dev; return v; } static void *lec_get_idx(struct lec_state *state, loff_t l) { void *v = NULL; for (; state->itf < MAX_LEC_ITF; state->itf++) { v = lec_itf_walk(state, &l); if (v) break; } return v; } static void *lec_seq_start(struct seq_file *seq, loff_t *pos) { struct lec_state *state = seq->private; state->itf = 0; state->dev = NULL; state->locked = NULL; state->arp_table = 0; state->misc_table = 0; state->node = SEQ_START_TOKEN; return *pos ? lec_get_idx(state, *pos) : SEQ_START_TOKEN; } static void lec_seq_stop(struct seq_file *seq, void *v) { struct lec_state *state = seq->private; if (state->dev) { spin_unlock_irqrestore(&state->locked->lec_arp_lock, state->flags); dev_put(state->dev); } } static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct lec_state *state = seq->private; ++*pos; return lec_get_idx(state, 1); } static int lec_seq_show(struct seq_file *seq, void *v) { static const char lec_banner[] = "Itf MAC ATM destination" " Status Flags " "VPI/VCI Recv VPI/VCI\n"; if (v == SEQ_START_TOKEN) seq_puts(seq, lec_banner); else { struct lec_state *state = seq->private; struct net_device *dev = state->dev; struct lec_arp_table *entry = hlist_entry(state->node, struct lec_arp_table, next); seq_printf(seq, "%s ", dev->name); lec_info(seq, entry); } return 0; } static const struct seq_operations lec_seq_ops = { .start = lec_seq_start, .next = lec_seq_next, .stop = lec_seq_stop, .show = lec_seq_show, }; #endif static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct atm_vcc *vcc = ATM_SD(sock); int err = 0; switch (cmd) { case ATMLEC_CTRL: case ATMLEC_MCAST: case ATMLEC_DATA: if (!capable(CAP_NET_ADMIN)) return -EPERM; break; default: return -ENOIOCTLCMD; } switch (cmd) { case ATMLEC_CTRL: err = lecd_attach(vcc, (int)arg); if (err >= 0) sock->state = SS_CONNECTED; break; case ATMLEC_MCAST: err = lec_mcast_attach(vcc, (int)arg); break; case ATMLEC_DATA: err = lec_vcc_attach(vcc, (void __user *)arg); break; } return err; } static struct atm_ioctl lane_ioctl_ops = { .owner = THIS_MODULE, .ioctl = lane_ioctl, }; static int __init lane_module_init(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *p; p = proc_create_seq_private("lec", 0444, atm_proc_root, &lec_seq_ops, sizeof(struct lec_state), NULL); if (!p) { pr_err("Unable to initialize /proc/net/atm/lec\n"); return -ENOMEM; } #endif register_atm_ioctl(&lane_ioctl_ops); pr_info("lec.c: initialized\n"); return 0; } static void __exit lane_module_cleanup(void) { int i; #ifdef CONFIG_PROC_FS remove_proc_entry("lec", atm_proc_root); #endif deregister_atm_ioctl(&lane_ioctl_ops); for (i = 0; i < MAX_LEC_ITF; i++) { if (dev_lec[i] != NULL) { unregister_netdev(dev_lec[i]); free_netdev(dev_lec[i]); dev_lec[i] = NULL; } } } module_init(lane_module_init); module_exit(lane_module_cleanup); /* * LANE2: 3.1.3, LE_RESOLVE.request * Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs. * If sizeoftlvs == NULL the default TLVs associated with this * lec will be used. * If dst_mac == NULL, targetless LE_ARP will be sent */ static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force, u8 **tlvs, u32 *sizeoftlvs) { unsigned long flags; struct lec_priv *priv = netdev_priv(dev); struct lec_arp_table *table; struct sk_buff *skb; int retval; if (force == 0) { spin_lock_irqsave(&priv->lec_arp_lock, flags); table = lec_arp_find(priv, dst_mac); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); if (table == NULL) return -1; *tlvs = kmemdup(table->tlvs, table->sizeoftlvs, GFP_ATOMIC); if (*tlvs == NULL) return -1; *sizeoftlvs = table->sizeoftlvs; return 0; } if (sizeoftlvs == NULL) retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, NULL); else { skb = alloc_skb(*sizeoftlvs, GFP_ATOMIC); if (skb == NULL) return -1; skb->len = *sizeoftlvs; skb_copy_to_linear_data(skb, *tlvs, *sizeoftlvs); retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, skb); } return retval; } /* * LANE2: 3.1.4, LE_ASSOCIATE.request * Associate the *tlvs with the *lan_dst address. * Will overwrite any previous association * Returns 1 for success, 0 for failure (out of memory) * */ static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst, const u8 *tlvs, u32 sizeoftlvs) { int retval; struct sk_buff *skb; struct lec_priv *priv = netdev_priv(dev); if (!ether_addr_equal(lan_dst, dev->dev_addr)) return 0; /* not our mac address */ kfree(priv->tlvs); /* NULL if there was no previous association */ priv->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL); if (priv->tlvs == NULL) return 0; priv->sizeoftlvs = sizeoftlvs; skb = alloc_skb(sizeoftlvs, GFP_ATOMIC); if (skb == NULL) return 0; skb->len = sizeoftlvs; skb_copy_to_linear_data(skb, tlvs, sizeoftlvs); retval = send_to_lecd(priv, l_associate_req, NULL, NULL, skb); if (retval != 0) pr_info("lec.c: lane2_associate_req() failed\n"); /* * If the previous association has changed we must * somehow notify other LANE entities about the change */ return 1; } /* * LANE2: 3.1.5, LE_ASSOCIATE.indication * */ static void lane2_associate_ind(struct net_device *dev, const u8 *mac_addr, const u8 *tlvs, u32 sizeoftlvs) { #if 0 int i = 0; #endif struct lec_priv *priv = netdev_priv(dev); #if 0 /* * Why have the TLVs in LE_ARP entries * since we do not use them? When you * uncomment this code, make sure the * TLVs get freed when entry is killed */ struct lec_arp_table *entry = lec_arp_find(priv, mac_addr); if (entry == NULL) return; /* should not happen */ kfree(entry->tlvs); entry->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL); if (entry->tlvs == NULL) return; entry->sizeoftlvs = sizeoftlvs; #endif #if 0 pr_info("\n"); pr_info("dump of tlvs, sizeoftlvs=%d\n", sizeoftlvs); while (i < sizeoftlvs) pr_cont("%02x ", tlvs[i++]); pr_cont("\n"); #endif /* tell MPOA about the TLVs we saw */ if (priv->lane2_ops && priv->lane2_ops->associate_indicator) { priv->lane2_ops->associate_indicator(dev, mac_addr, tlvs, sizeoftlvs); } } /* * Here starts what used to lec_arpc.c * * lec_arpc.c was added here when making * lane client modular. October 1997 */ #include <linux/types.h> #include <linux/timer.h> #include <linux/param.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <net/route.h> #if 0 #define pr_debug(format, args...) /* #define pr_debug printk */ #endif #define DEBUG_ARP_TABLE 0 #define LEC_ARP_REFRESH_INTERVAL (3*HZ) static void lec_arp_check_expire(struct work_struct *work); static void lec_arp_expire_arp(struct timer_list *t); /* * Arp table funcs */ #define HASH(ch) (ch & (LEC_ARP_TABLE_SIZE - 1)) /* * Initialization of arp-cache */ static void lec_arp_init(struct lec_priv *priv) { unsigned short i; for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) INIT_HLIST_HEAD(&priv->lec_arp_tables[i]); INIT_HLIST_HEAD(&priv->lec_arp_empty_ones); INIT_HLIST_HEAD(&priv->lec_no_forward); INIT_HLIST_HEAD(&priv->mcast_fwds); spin_lock_init(&priv->lec_arp_lock); INIT_DELAYED_WORK(&priv->lec_arp_work, lec_arp_check_expire); schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL); } static void lec_arp_clear_vccs(struct lec_arp_table *entry) { if (entry->vcc) { struct atm_vcc *vcc = entry->vcc; struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); struct net_device *dev = (struct net_device *)vcc->proto_data; vcc->pop = vpriv->old_pop; if (vpriv->xoff) netif_wake_queue(dev); kfree(vpriv); vcc->user_back = NULL; vcc->push = entry->old_push; vcc_release_async(vcc, -EPIPE); entry->vcc = NULL; } if (entry->recv_vcc) { struct atm_vcc *vcc = entry->recv_vcc; struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); kfree(vpriv); vcc->user_back = NULL; entry->recv_vcc->push = entry->old_recv_push; vcc_release_async(entry->recv_vcc, -EPIPE); entry->recv_vcc = NULL; } } /* * Insert entry to lec_arp_table * LANE2: Add to the end of the list to satisfy 8.1.13 */ static inline void lec_arp_add(struct lec_priv *priv, struct lec_arp_table *entry) { struct hlist_head *tmp; tmp = &priv->lec_arp_tables[HASH(entry->mac_addr[ETH_ALEN - 1])]; hlist_add_head(&entry->next, tmp); pr_debug("Added entry:%pM\n", entry->mac_addr); } /* * Remove entry from lec_arp_table */ static int lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove) { struct lec_arp_table *entry; int i, remove_vcc = 1; if (!to_remove) return -1; hlist_del(&to_remove->next); del_timer(&to_remove->timer); /* * If this is the only MAC connected to this VCC, * also tear down the VCC */ if (to_remove->status >= ESI_FLUSH_PENDING) { /* * ESI_FLUSH_PENDING, ESI_FORWARD_DIRECT */ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (memcmp(to_remove->atm_addr, entry->atm_addr, ATM_ESA_LEN) == 0) { remove_vcc = 0; break; } } } if (remove_vcc) lec_arp_clear_vccs(to_remove); } skb_queue_purge(&to_remove->tx_wait); /* FIXME: good place for this? */ pr_debug("Removed entry:%pM\n", to_remove->mac_addr); return 0; } #if DEBUG_ARP_TABLE static const char *get_status_string(unsigned char st) { switch (st) { case ESI_UNKNOWN: return "ESI_UNKNOWN"; case ESI_ARP_PENDING: return "ESI_ARP_PENDING"; case ESI_VC_PENDING: return "ESI_VC_PENDING"; case ESI_FLUSH_PENDING: return "ESI_FLUSH_PENDING"; case ESI_FORWARD_DIRECT: return "ESI_FORWARD_DIRECT"; } return "<UNKNOWN>"; } static void dump_arp_table(struct lec_priv *priv) { struct lec_arp_table *rulla; char buf[256]; int i, offset; pr_info("Dump %p:\n", priv); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(rulla, &priv->lec_arp_tables[i], next) { offset = 0; offset += sprintf(buf, "%d: %p\n", i, rulla); offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, rulla->vcc ? rulla->vcc->vci : 0, rulla->recv_vcc ? rulla->recv_vcc-> vpi : 0, rulla->recv_vcc ? rulla->recv_vcc-> vci : 0, rulla->last_used, rulla->timestamp, rulla->no_tries); offset += sprintf(buf + offset, "Flags:%x, Packets_flooded:%x, Status: %s ", rulla->flags, rulla->packets_flooded, get_status_string(rulla->status)); pr_info("%s\n", buf); } } if (!hlist_empty(&priv->lec_no_forward)) pr_info("No forward\n"); hlist_for_each_entry(rulla, &priv->lec_no_forward, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, rulla->vcc ? rulla->vcc->vci : 0, rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, rulla->recv_vcc ? rulla->recv_vcc->vci : 0, rulla->last_used, rulla->timestamp, rulla->no_tries); offset += sprintf(buf + offset, "Flags:%x, Packets_flooded:%x, Status: %s ", rulla->flags, rulla->packets_flooded, get_status_string(rulla->status)); pr_info("%s\n", buf); } if (!hlist_empty(&priv->lec_arp_empty_ones)) pr_info("Empty ones\n"); hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, rulla->vcc ? rulla->vcc->vci : 0, rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, rulla->recv_vcc ? rulla->recv_vcc->vci : 0, rulla->last_used, rulla->timestamp, rulla->no_tries); offset += sprintf(buf + offset, "Flags:%x, Packets_flooded:%x, Status: %s ", rulla->flags, rulla->packets_flooded, get_status_string(rulla->status)); pr_info("%s", buf); } if (!hlist_empty(&priv->mcast_fwds)) pr_info("Multicast Forward VCCs\n"); hlist_for_each_entry(rulla, &priv->mcast_fwds, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, rulla->vcc ? rulla->vcc->vci : 0, rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, rulla->recv_vcc ? rulla->recv_vcc->vci : 0, rulla->last_used, rulla->timestamp, rulla->no_tries); offset += sprintf(buf + offset, "Flags:%x, Packets_flooded:%x, Status: %s ", rulla->flags, rulla->packets_flooded, get_status_string(rulla->status)); pr_info("%s\n", buf); } } #else #define dump_arp_table(priv) do { } while (0) #endif /* * Destruction of arp-cache */ static void lec_arp_destroy(struct lec_priv *priv) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry; int i; cancel_delayed_work_sync(&priv->lec_arp_work); /* * Remove all entries */ spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { lec_arp_remove(priv, entry); lec_arp_put(entry); } INIT_HLIST_HEAD(&priv->lec_arp_tables[i]); } hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { del_timer_sync(&entry->timer); lec_arp_clear_vccs(entry); hlist_del(&entry->next); lec_arp_put(entry); } INIT_HLIST_HEAD(&priv->lec_arp_empty_ones); hlist_for_each_entry_safe(entry, next, &priv->lec_no_forward, next) { del_timer_sync(&entry->timer); lec_arp_clear_vccs(entry); hlist_del(&entry->next); lec_arp_put(entry); } INIT_HLIST_HEAD(&priv->lec_no_forward); hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) { /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ lec_arp_clear_vccs(entry); hlist_del(&entry->next); lec_arp_put(entry); } INIT_HLIST_HEAD(&priv->mcast_fwds); priv->mcast_vcc = NULL; spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } /* * Find entry by mac_address */ static struct lec_arp_table *lec_arp_find(struct lec_priv *priv, const unsigned char *mac_addr) { struct hlist_head *head; struct lec_arp_table *entry; pr_debug("%pM\n", mac_addr); head = &priv->lec_arp_tables[HASH(mac_addr[ETH_ALEN - 1])]; hlist_for_each_entry(entry, head, next) { if (ether_addr_equal(mac_addr, entry->mac_addr)) return entry; } return NULL; } static struct lec_arp_table *make_entry(struct lec_priv *priv, const unsigned char *mac_addr) { struct lec_arp_table *to_return; to_return = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC); if (!to_return) return NULL; ether_addr_copy(to_return->mac_addr, mac_addr); INIT_HLIST_NODE(&to_return->next); timer_setup(&to_return->timer, lec_arp_expire_arp, 0); to_return->last_used = jiffies; to_return->priv = priv; skb_queue_head_init(&to_return->tx_wait); refcount_set(&to_return->usage, 1); return to_return; } /* Arp sent timer expired */ static void lec_arp_expire_arp(struct timer_list *t) { struct lec_arp_table *entry; entry = from_timer(entry, t, timer); pr_debug("\n"); if (entry->status == ESI_ARP_PENDING) { if (entry->no_tries <= entry->priv->max_retry_count) { if (entry->is_rdesc) send_to_lecd(entry->priv, l_rdesc_arp_xmt, entry->mac_addr, NULL, NULL); else send_to_lecd(entry->priv, l_arp_xmt, entry->mac_addr, NULL, NULL); entry->no_tries++; } mod_timer(&entry->timer, jiffies + (1 * HZ)); } } /* Unknown/unused vcc expire, remove associated entry */ static void lec_arp_expire_vcc(struct timer_list *t) { unsigned long flags; struct lec_arp_table *to_remove = from_timer(to_remove, t, timer); struct lec_priv *priv = to_remove->priv; del_timer(&to_remove->timer); pr_debug("%p %p: vpi:%d vci:%d\n", to_remove, priv, to_remove->vcc ? to_remove->recv_vcc->vpi : 0, to_remove->vcc ? to_remove->recv_vcc->vci : 0); spin_lock_irqsave(&priv->lec_arp_lock, flags); hlist_del(&to_remove->next); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); lec_arp_clear_vccs(to_remove); lec_arp_put(to_remove); } static bool __lec_arp_check_expire(struct lec_arp_table *entry, unsigned long now, struct lec_priv *priv) { unsigned long time_to_check; if ((entry->flags) & LEC_REMOTE_FLAG && priv->topology_change) time_to_check = priv->forward_delay_time; else time_to_check = priv->aging_time; pr_debug("About to expire: %lx - %lx > %lx\n", now, entry->last_used, time_to_check); if (time_after(now, entry->last_used + time_to_check) && !(entry->flags & LEC_PERMANENT_FLAG) && !(entry->mac_addr[0] & 0x01)) { /* LANE2: 7.1.20 */ /* Remove entry */ pr_debug("Entry timed out\n"); lec_arp_remove(priv, entry); lec_arp_put(entry); } else { /* Something else */ if ((entry->status == ESI_VC_PENDING || entry->status == ESI_ARP_PENDING) && time_after_eq(now, entry->timestamp + priv->max_unknown_frame_time)) { entry->timestamp = jiffies; entry->packets_flooded = 0; if (entry->status == ESI_VC_PENDING) send_to_lecd(priv, l_svc_setup, entry->mac_addr, entry->atm_addr, NULL); } if (entry->status == ESI_FLUSH_PENDING && time_after_eq(now, entry->timestamp + priv->path_switching_delay)) { lec_arp_hold(entry); return true; } } return false; } /* * Expire entries. * 1. Re-set timer * 2. For each entry, delete entries that have aged past the age limit. * 3. For each entry, depending on the status of the entry, perform * the following maintenance. * a. If status is ESI_VC_PENDING or ESI_ARP_PENDING then if the * tick_count is above the max_unknown_frame_time, clear * the tick_count to zero and clear the packets_flooded counter * to zero. This supports the packet rate limit per address * while flooding unknowns. * b. If the status is ESI_FLUSH_PENDING and the tick_count is greater * than or equal to the path_switching_delay, change the status * to ESI_FORWARD_DIRECT. This causes the flush period to end * regardless of the progress of the flush protocol. */ static void lec_arp_check_expire(struct work_struct *work) { unsigned long flags; struct lec_priv *priv = container_of(work, struct lec_priv, lec_arp_work.work); struct hlist_node *next; struct lec_arp_table *entry; unsigned long now; int i; pr_debug("%p\n", priv); now = jiffies; restart: spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (__lec_arp_check_expire(entry, now, priv)) { struct sk_buff *skb; struct atm_vcc *vcc = entry->vcc; spin_unlock_irqrestore(&priv->lec_arp_lock, flags); while ((skb = skb_dequeue(&entry->tx_wait))) lec_send(vcc, skb); entry->last_used = jiffies; entry->status = ESI_FORWARD_DIRECT; lec_arp_put(entry); goto restart; } } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL); } /* * Try to find vcc where mac_address is attached. * */ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, const unsigned char *mac_to_find, int is_rdesc, struct lec_arp_table **ret_entry) { unsigned long flags; struct lec_arp_table *entry; struct atm_vcc *found; if (mac_to_find[0] & 0x01) { switch (priv->lane_version) { case 1: return priv->mcast_vcc; case 2: /* LANE2 wants arp for multicast addresses */ if (ether_addr_equal(mac_to_find, bus_mac)) return priv->mcast_vcc; break; default: break; } } spin_lock_irqsave(&priv->lec_arp_lock, flags); entry = lec_arp_find(priv, mac_to_find); if (entry) { if (entry->status == ESI_FORWARD_DIRECT) { /* Connection Ok */ entry->last_used = jiffies; lec_arp_hold(entry); *ret_entry = entry; found = entry->vcc; goto out; } /* * If the LE_ARP cache entry is still pending, reset count to 0 * so another LE_ARP request can be made for this frame. */ if (entry->status == ESI_ARP_PENDING) entry->no_tries = 0; /* * Data direct VC not yet set up, check to see if the unknown * frame count is greater than the limit. If the limit has * not been reached, allow the caller to send packet to * BUS. */ if (entry->status != ESI_FLUSH_PENDING && entry->packets_flooded < priv->maximum_unknown_frame_count) { entry->packets_flooded++; pr_debug("Flooding..\n"); found = priv->mcast_vcc; goto out; } /* * We got here because entry->status == ESI_FLUSH_PENDING * or BUS flood limit was reached for an entry which is * in ESI_ARP_PENDING or ESI_VC_PENDING state. */ lec_arp_hold(entry); *ret_entry = entry; pr_debug("entry->status %d entry->vcc %p\n", entry->status, entry->vcc); found = NULL; } else { /* No matching entry was found */ entry = make_entry(priv, mac_to_find); pr_debug("Making entry\n"); if (!entry) { found = priv->mcast_vcc; goto out; } lec_arp_add(priv, entry); /* We want arp-request(s) to be sent */ entry->packets_flooded = 1; entry->status = ESI_ARP_PENDING; entry->no_tries = 1; entry->last_used = entry->timestamp = jiffies; entry->is_rdesc = is_rdesc; if (entry->is_rdesc) send_to_lecd(priv, l_rdesc_arp_xmt, mac_to_find, NULL, NULL); else send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL); entry->timer.expires = jiffies + (1 * HZ); entry->timer.function = lec_arp_expire_arp; add_timer(&entry->timer); found = priv->mcast_vcc; } out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); return found; } static int lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long permanent) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry; int i; pr_debug("\n"); spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) && (permanent || !(entry->flags & LEC_PERMANENT_FLAG))) { lec_arp_remove(priv, entry); lec_arp_put(entry); } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); return 0; } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); return -1; } /* * Notifies: Response to arp_request (atm_addr != NULL) */ static void lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, const unsigned char *atm_addr, unsigned long remoteflag, unsigned int targetless_le_arp) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry, *tmp; int i; pr_debug("%smac:%pM\n", (targetless_le_arp) ? "targetless " : "", mac_addr); spin_lock_irqsave(&priv->lec_arp_lock, flags); entry = lec_arp_find(priv, mac_addr); if (entry == NULL && targetless_le_arp) goto out; /* * LANE2: ignore targetless LE_ARPs for which * we have no entry in the cache. 7.1.30 */ if (!hlist_empty(&priv->lec_arp_empty_ones)) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) { hlist_del(&entry->next); del_timer(&entry->timer); tmp = lec_arp_find(priv, mac_addr); if (tmp) { del_timer(&tmp->timer); tmp->status = ESI_FORWARD_DIRECT; memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN); tmp->vcc = entry->vcc; tmp->old_push = entry->old_push; tmp->last_used = jiffies; del_timer(&entry->timer); lec_arp_put(entry); entry = tmp; } else { entry->status = ESI_FORWARD_DIRECT; ether_addr_copy(entry->mac_addr, mac_addr); entry->last_used = jiffies; lec_arp_add(priv, entry); } if (remoteflag) entry->flags |= LEC_REMOTE_FLAG; else entry->flags &= ~LEC_REMOTE_FLAG; pr_debug("After update\n"); dump_arp_table(priv); goto out; } } } entry = lec_arp_find(priv, mac_addr); if (!entry) { entry = make_entry(priv, mac_addr); if (!entry) goto out; entry->status = ESI_UNKNOWN; lec_arp_add(priv, entry); /* Temporary, changes before end of function */ } memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN); del_timer(&entry->timer); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(tmp, &priv->lec_arp_tables[i], next) { if (entry != tmp && !memcmp(tmp->atm_addr, atm_addr, ATM_ESA_LEN)) { /* Vcc to this host exists */ if (tmp->status > ESI_VC_PENDING) { /* * ESI_FLUSH_PENDING, * ESI_FORWARD_DIRECT */ entry->vcc = tmp->vcc; entry->old_push = tmp->old_push; } entry->status = tmp->status; break; } } } if (remoteflag) entry->flags |= LEC_REMOTE_FLAG; else entry->flags &= ~LEC_REMOTE_FLAG; if (entry->status == ESI_ARP_PENDING || entry->status == ESI_UNKNOWN) { entry->status = ESI_VC_PENDING; send_to_lecd(priv, l_svc_setup, entry->mac_addr, atm_addr, NULL); } pr_debug("After update2\n"); dump_arp_table(priv); out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } /* * Notifies: Vcc setup ready */ static void lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, struct atm_vcc *vcc, void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb)) { unsigned long flags; struct lec_arp_table *entry; int i, found_entry = 0; spin_lock_irqsave(&priv->lec_arp_lock, flags); /* Vcc for Multicast Forward. No timer, LANEv2 7.1.20 and 2.3.5.3 */ if (ioc_data->receive == 2) { pr_debug("LEC_ARP: Attaching mcast forward\n"); #if 0 entry = lec_arp_find(priv, bus_mac); if (!entry) { pr_info("LEC_ARP: Multicast entry not found!\n"); goto out; } memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); entry->recv_vcc = vcc; entry->old_recv_push = old_push; #endif entry = make_entry(priv, bus_mac); if (entry == NULL) goto out; del_timer(&entry->timer); memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); entry->recv_vcc = vcc; entry->old_recv_push = old_push; hlist_add_head(&entry->next, &priv->mcast_fwds); goto out; } else if (ioc_data->receive == 1) { /* * Vcc which we don't want to make default vcc, * attach it anyway. */ pr_debug("LEC_ARP:Attaching data direct, not default: %*phN\n", ATM_ESA_LEN, ioc_data->atm_addr); entry = make_entry(priv, bus_mac); if (entry == NULL) goto out; memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); eth_zero_addr(entry->mac_addr); entry->recv_vcc = vcc; entry->old_recv_push = old_push; entry->status = ESI_UNKNOWN; entry->timer.expires = jiffies + priv->vcc_timeout_period; entry->timer.function = lec_arp_expire_vcc; hlist_add_head(&entry->next, &priv->lec_no_forward); add_timer(&entry->timer); dump_arp_table(priv); goto out; } pr_debug("LEC_ARP:Attaching data direct, default: %*phN\n", ATM_ESA_LEN, ioc_data->atm_addr); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (memcmp (ioc_data->atm_addr, entry->atm_addr, ATM_ESA_LEN) == 0) { pr_debug("LEC_ARP: Attaching data direct\n"); pr_debug("Currently -> Vcc: %d, Rvcc:%d\n", entry->vcc ? entry->vcc->vci : 0, entry->recv_vcc ? entry->recv_vcc-> vci : 0); found_entry = 1; del_timer(&entry->timer); entry->vcc = vcc; entry->old_push = old_push; if (entry->status == ESI_VC_PENDING) { if (priv->maximum_unknown_frame_count == 0) entry->status = ESI_FORWARD_DIRECT; else { entry->timestamp = jiffies; entry->status = ESI_FLUSH_PENDING; #if 0 send_to_lecd(priv, l_flush_xmt, NULL, entry->atm_addr, NULL); #endif } } else { /* * They were forming a connection * to us, and we to them. Our * ATM address is numerically lower * than theirs, so we make connection * we formed into default VCC (8.1.11). * Connection they made gets torn * down. This might confuse some * clients. Can be changed if * someone reports trouble... */ ; } } } } if (found_entry) { pr_debug("After vcc was added\n"); dump_arp_table(priv); goto out; } /* * Not found, snatch address from first data packet that arrives * from this vcc */ entry = make_entry(priv, bus_mac); if (!entry) goto out; entry->vcc = vcc; entry->old_push = old_push; memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); eth_zero_addr(entry->mac_addr); entry->status = ESI_UNKNOWN; hlist_add_head(&entry->next, &priv->lec_arp_empty_ones); entry->timer.expires = jiffies + priv->vcc_timeout_period; entry->timer.function = lec_arp_expire_vcc; add_timer(&entry->timer); pr_debug("After vcc was added\n"); dump_arp_table(priv); out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id) { unsigned long flags; struct lec_arp_table *entry; int i; pr_debug("%lx\n", tran_id); restart: spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (entry->flush_tran_id == tran_id && entry->status == ESI_FLUSH_PENDING) { struct sk_buff *skb; struct atm_vcc *vcc = entry->vcc; lec_arp_hold(entry); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); while ((skb = skb_dequeue(&entry->tx_wait))) lec_send(vcc, skb); entry->last_used = jiffies; entry->status = ESI_FORWARD_DIRECT; lec_arp_put(entry); pr_debug("LEC_ARP: Flushed\n"); goto restart; } } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); dump_arp_table(priv); } static void lec_set_flush_tran_id(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long tran_id) { unsigned long flags; struct lec_arp_table *entry; int i; spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN)) { entry->flush_tran_id = tran_id; pr_debug("Set flush transaction id to %lx for %p\n", tran_id, entry); } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc) { unsigned long flags; unsigned char mac_addr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; struct lec_arp_table *to_add; struct lec_vcc_priv *vpriv; int err = 0; vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL); if (!vpriv) return -ENOMEM; vpriv->xoff = 0; vpriv->old_pop = vcc->pop; vcc->user_back = vpriv; vcc->pop = lec_pop; spin_lock_irqsave(&priv->lec_arp_lock, flags); to_add = make_entry(priv, mac_addr); if (!to_add) { vcc->pop = vpriv->old_pop; kfree(vpriv); err = -ENOMEM; goto out; } memcpy(to_add->atm_addr, vcc->remote.sas_addr.prv, ATM_ESA_LEN); to_add->status = ESI_FORWARD_DIRECT; to_add->flags |= LEC_PERMANENT_FLAG; to_add->vcc = vcc; to_add->old_push = vcc->push; vcc->push = lec_push; priv->mcast_vcc = vcc; lec_arp_add(priv, to_add); out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); return err; } static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry; int i; pr_debug("LEC_ARP: lec_vcc_close vpi:%d vci:%d\n", vcc->vpi, vcc->vci); dump_arp_table(priv); spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (vcc == entry->vcc) { lec_arp_remove(priv, entry); lec_arp_put(entry); if (priv->mcast_vcc == vcc) priv->mcast_vcc = NULL; } } } hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (entry->vcc == vcc) { lec_arp_clear_vccs(entry); del_timer(&entry->timer); hlist_del(&entry->next); lec_arp_put(entry); } } hlist_for_each_entry_safe(entry, next, &priv->lec_no_forward, next) { if (entry->recv_vcc == vcc) { lec_arp_clear_vccs(entry); del_timer(&entry->timer); hlist_del(&entry->next); lec_arp_put(entry); } } hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) { if (entry->recv_vcc == vcc) { lec_arp_clear_vccs(entry); /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ hlist_del(&entry->next); lec_arp_put(entry); } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); dump_arp_table(priv); } static void lec_arp_check_empties(struct lec_priv *priv, struct atm_vcc *vcc, struct sk_buff *skb) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry, *tmp; struct lecdatahdr_8023 *hdr = (struct lecdatahdr_8023 *)skb->data; unsigned char *src = hdr->h_source; spin_lock_irqsave(&priv->lec_arp_lock, flags); hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (vcc == entry->vcc) { del_timer(&entry->timer); ether_addr_copy(entry->mac_addr, src); entry->status = ESI_FORWARD_DIRECT; entry->last_used = jiffies; /* We might have got an entry */ tmp = lec_arp_find(priv, src); if (tmp) { lec_arp_remove(priv, tmp); lec_arp_put(tmp); } hlist_del(&entry->next); lec_arp_add(priv, entry); goto out; } } pr_debug("LEC_ARP: Arp_check_empties: entry not found!\n"); out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } MODULE_DESCRIPTION("ATM LAN Emulation (LANE) support"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FIRMWARE_LOADER_H #define __FIRMWARE_LOADER_H #include <linux/bitops.h> #include <linux/firmware.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/completion.h> /** * enum fw_opt - options to control firmware loading behaviour * * @FW_OPT_UEVENT: Enables the fallback mechanism to send a kobject uevent * when the firmware is not found. Userspace is in charge to load the * firmware using the sysfs loading facility. * @FW_OPT_NOWAIT: Used to describe the firmware request is asynchronous. * @FW_OPT_USERHELPER: Enable the fallback mechanism, in case the direct * filesystem lookup fails at finding the firmware. For details refer to * firmware_fallback_sysfs(). * @FW_OPT_NO_WARN: Quiet, avoid printing warning messages. * @FW_OPT_NOCACHE: Disables firmware caching. Firmware caching is used to * cache the firmware upon suspend, so that upon resume races against the * firmware file lookup on storage is avoided. Used for calls where the * file may be too big, or where the driver takes charge of its own * firmware caching mechanism. * @FW_OPT_NOFALLBACK_SYSFS: Disable the sysfs fallback mechanism. Takes * precedence over &FW_OPT_UEVENT and &FW_OPT_USERHELPER. * @FW_OPT_FALLBACK_PLATFORM: Enable fallback to device fw copy embedded in * the platform's main firmware. If both this fallback and the sysfs * fallback are enabled, then this fallback will be tried first. * @FW_OPT_PARTIAL: Allow partial read of firmware instead of needing to read * entire file. */ enum fw_opt { FW_OPT_UEVENT = BIT(0), FW_OPT_NOWAIT = BIT(1), FW_OPT_USERHELPER = BIT(2), FW_OPT_NO_WARN = BIT(3), FW_OPT_NOCACHE = BIT(4), FW_OPT_NOFALLBACK_SYSFS = BIT(5), FW_OPT_FALLBACK_PLATFORM = BIT(6), FW_OPT_PARTIAL = BIT(7), }; enum fw_status { FW_STATUS_UNKNOWN, FW_STATUS_LOADING, FW_STATUS_DONE, FW_STATUS_ABORTED, }; /* * Concurrent request_firmware() for the same firmware need to be * serialized. struct fw_state is simple state machine which hold the * state of the firmware loading. */ struct fw_state { struct completion completion; enum fw_status status; }; struct fw_priv { struct kref ref; struct list_head list; struct firmware_cache *fwc; struct fw_state fw_st; void *data; size_t size; size_t allocated_size; size_t offset; u32 opt_flags; #ifdef CONFIG_FW_LOADER_PAGED_BUF bool is_paged_buf; struct page **pages; int nr_pages; int page_array_size; #endif #ifdef CONFIG_FW_LOADER_USER_HELPER bool need_uevent; struct list_head pending_list; #endif const char *fw_name; }; extern struct mutex fw_lock; extern struct firmware_cache fw_cache; extern bool fw_load_abort_all; static inline bool __fw_state_check(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; return fw_st->status == status; } static inline int __fw_state_wait_common(struct fw_priv *fw_priv, long timeout) { struct fw_state *fw_st = &fw_priv->fw_st; long ret; ret = wait_for_completion_killable_timeout(&fw_st->completion, timeout); if (ret != 0 && fw_st->status == FW_STATUS_ABORTED) return -ENOENT; if (!ret) return -ETIMEDOUT; return ret < 0 ? ret : 0; } static inline void __fw_state_set(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; WRITE_ONCE(fw_st->status, status); if (status == FW_STATUS_DONE || status == FW_STATUS_ABORTED) { #ifdef CONFIG_FW_LOADER_USER_HELPER /* * Doing this here ensures that the fw_priv is deleted from * the pending list in all abort/done paths. */ list_del_init(&fw_priv->pending_list); #endif complete_all(&fw_st->completion); } } static inline void fw_state_aborted(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_ABORTED); } static inline bool fw_state_is_aborted(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_ABORTED); } static inline void fw_state_start(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_LOADING); } static inline void fw_state_done(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_done(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_loading(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_LOADING); } int alloc_lookup_fw_priv(const char *fw_name, struct firmware_cache *fwc, struct fw_priv **fw_priv, void *dbuf, size_t size, size_t offset, u32 opt_flags); int assign_fw(struct firmware *fw, struct device *device); void free_fw_priv(struct fw_priv *fw_priv); void fw_state_init(struct fw_priv *fw_priv); #ifdef CONFIG_FW_LOADER bool firmware_is_builtin(const struct firmware *fw); bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size); #else /* module case */ static inline bool firmware_is_builtin(const struct firmware *fw) { return false; } static inline bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size) { return false; } #endif #ifdef CONFIG_FW_LOADER_PAGED_BUF void fw_free_paged_buf(struct fw_priv *fw_priv); int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed); int fw_map_paged_buf(struct fw_priv *fw_priv); bool fw_is_paged_buf(struct fw_priv *fw_priv); #else static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {} static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; } static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; } static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; } #endif #endif /* __FIRMWARE_LOADER_H */
332 331 117 117 117 117 117 116 2436 1593 1058 4397 4384 33 1989 19 117 117 117 117 117 117 117 117 117 117 116 11 11 11 11 117 107 11 11 16 4399 14 4428 4430 51 4399 65 4395 4405 4395 4426 1901 1898 1901 95 1846 325 4140 4139 4141 4146 1605 1608 1610 20 18 46 22 31 1565 1 1 1 734 35593 10 35735 35725 35780 35777 642 38 12 12 935 95 731 35415 33993 36336 238 239 2690 389 3090 3095 3077 334 3 3 26 6 25 1 25 6 6 6 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> #include <linux/init_task.h> #include "internal.h" bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) { /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { atomic_long_set(&ref->refcnt, FILE_REF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > FILE_REF_MAXREF) atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); return false; } /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count * @cnt: Current reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { /* Did this drop the last reference? */ if (likely(cnt == FILE_REF_NOREF)) { /* * Carefully try to set the reference count to FILE_REF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". */ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; unsigned int nr; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. Since we called only * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab * already gives BITS_PER_LONG slots), the above boils down to * 1. use the smallest power of two large enough to give us that many * slots. * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is * 256 slots (i.e. 1Kb fd array). * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there * and we are never going to be asked for 64 or less. */ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) nr = 256; else nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) return ERR_PTR(-EMFILE); } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (IS_ERR(new_fdt)) return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* Can we expand? */ if (unlikely(nr >= sysctl_nr_open)) return -EMFILE; /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, bool set) { if (set) { __set_bit(fd, fdt->close_on_exec); } else { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (test_bit(fd, fdt->full_fds_bits)) __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files); if (IS_ERR(new_fdt)) { kmem_cache_free(files_cachep, newf); return ERR_CAST(new_fdt); } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; /* * We may be racing against fd allocation from other threads using this * files_struct, despite holding ->file_lock. * * alloc_fd() might have already claimed a slot, while fd_install() * did not populate it yet. Note the latter operates locklessly, so * the file can show up as we are walking the array below. * * At the same time we know no files will disappear as all other * operations take the lock. * * Instead of trying to placate userspace racing with itself, we * ref the file if we see it and mark the fd slot as unused otherwise. */ for (i = open_files; i != 0; i--) { struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (unlikely(fd >= end)) goto out; if (unlikely(fd >= fdt->max_fds)) { error = expand_files(files, fd); if (error < 0) goto out; goto repeat; } if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = &range; /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * file_ref_get() above provided a full memory barrier when we * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. */ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * file_ref_get() gives us a full memory barrier. We * only really need an 'acquire' one to protect the * loads below, but we don't have that. */ if (unlikely(!file_ref_get(&file->f_ref))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. * * (As an exception to rule 2, you can call filp_close between fget_light and * fput_light provided that you capture a real refcount with get_file before * the call to filp_close, and ensure that this real refcount is fput *after* * the fput_light call.) * * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS)) return false; if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) return true; if (file->f_op->iterate_shared) return true; return false; } bool file_seek_cur_needs_f_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; VFS_WARN_ON_ONCE((file_count(file) > 1) && !mutex_is_locked(&file->f_pos_lock)); return true; } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * dup2() is expected to close the file installed in the target fd slot * (if any). However, userspace hand-picking a fd may be racing against * its own threads which happened to allocate it in open() et al but did * not populate it yet. * * Broadly speaking we may be racing against the following: * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL * file = hard_work_goes_here(); * fd_install(fd, file); // only now ->fd[fd] == file * * It is an invariant that a successfully allocated fd has a NULL entry * in the array until the matching fd_install(). * * If we fit the window, we have the fd to populate, yet no target file * to close. Trying to ignore it and install our new file would violate * the invariant and make fd_install() overwrite our file. * * Things can be done(tm) to handle this. However, the issue does not * concern legitimate programs and we only need to make sure the kernel * does not trip over it. * * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { put_unused_fd(new_fd); return error; } } fd_install(new_fd, get_file(file)); __receive_sock(file); return new_fd; } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0 /* net/atm/resources.c - Statically allocated resources */ /* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ /* Fixes * Arnaldo Carvalho de Melo <acme@conectiva.com.br> * 2002/01 - don't free the whole struct sock on sk->destruct time, * use the default destruct function initialized by sock_init_data */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/ctype.h> #include <linux/string.h> #include <linux/atmdev.h> #include <linux/sonet.h> #include <linux/kernel.h> /* for barrier */ #include <linux/module.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/slab.h> #include <net/sock.h> /* for struct sock */ #include "common.h" #include "resources.h" #include "addr.h" LIST_HEAD(atm_devs); DEFINE_MUTEX(atm_dev_mutex); static struct atm_dev *__alloc_atm_dev(const char *type) { struct atm_dev *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; dev->type = type; dev->signal = ATM_PHY_SIG_UNKNOWN; dev->link_rate = ATM_OC3_PCR; spin_lock_init(&dev->lock); INIT_LIST_HEAD(&dev->local); INIT_LIST_HEAD(&dev->lecs); return dev; } static struct atm_dev *__atm_dev_lookup(int number) { struct atm_dev *dev; list_for_each_entry(dev, &atm_devs, dev_list) { if (dev->number == number) { atm_dev_hold(dev); return dev; } } return NULL; } struct atm_dev *atm_dev_lookup(int number) { struct atm_dev *dev; mutex_lock(&atm_dev_mutex); dev = __atm_dev_lookup(number); mutex_unlock(&atm_dev_mutex); return dev; } EXPORT_SYMBOL(atm_dev_lookup); struct atm_dev *atm_dev_register(const char *type, struct device *parent, const struct atmdev_ops *ops, int number, unsigned long *flags) { struct atm_dev *dev, *inuse; dev = __alloc_atm_dev(type); if (!dev) { pr_err("no space for dev %s\n", type); return NULL; } mutex_lock(&atm_dev_mutex); if (number != -1) { inuse = __atm_dev_lookup(number); if (inuse) { atm_dev_put(inuse); mutex_unlock(&atm_dev_mutex); kfree(dev); return NULL; } dev->number = number; } else { dev->number = 0; while ((inuse = __atm_dev_lookup(dev->number))) { atm_dev_put(inuse); dev->number++; } } dev->ops = ops; if (flags) dev->flags = *flags; else memset(&dev->flags, 0, sizeof(dev->flags)); memset(&dev->stats, 0, sizeof(dev->stats)); refcount_set(&dev->refcnt, 1); if (atm_proc_dev_register(dev) < 0) { pr_err("atm_proc_dev_register failed for dev %s\n", type); goto out_fail; } if (atm_register_sysfs(dev, parent) < 0) { pr_err("atm_register_sysfs failed for dev %s\n", type); atm_proc_dev_deregister(dev); goto out_fail; } list_add_tail(&dev->dev_list, &atm_devs); out: mutex_unlock(&atm_dev_mutex); return dev; out_fail: kfree(dev); dev = NULL; goto out; } EXPORT_SYMBOL(atm_dev_register); void atm_dev_deregister(struct atm_dev *dev) { BUG_ON(test_bit(ATM_DF_REMOVED, &dev->flags)); set_bit(ATM_DF_REMOVED, &dev->flags); /* * if we remove current device from atm_devs list, new device * with same number can appear, such we need deregister proc, * release async all vccs and remove them from vccs list too */ mutex_lock(&atm_dev_mutex); list_del(&dev->dev_list); mutex_unlock(&atm_dev_mutex); atm_dev_release_vccs(dev); atm_unregister_sysfs(dev); atm_proc_dev_deregister(dev); atm_dev_put(dev); } EXPORT_SYMBOL(atm_dev_deregister); static void copy_aal_stats(struct k_atm_aal_stats *from, struct atm_aal_stats *to) { #define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) __AAL_STAT_ITEMS #undef __HANDLE_ITEM } static void subtract_aal_stats(struct k_atm_aal_stats *from, struct atm_aal_stats *to) { #define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i) __AAL_STAT_ITEMS #undef __HANDLE_ITEM } static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, int zero) { struct atm_dev_stats tmp; int error = 0; copy_aal_stats(&dev->stats.aal0, &tmp.aal0); copy_aal_stats(&dev->stats.aal34, &tmp.aal34); copy_aal_stats(&dev->stats.aal5, &tmp.aal5); if (arg) error = copy_to_user(arg, &tmp, sizeof(tmp)); if (zero && !error) { subtract_aal_stats(&dev->stats.aal0, &tmp.aal0); subtract_aal_stats(&dev->stats.aal34, &tmp.aal34); subtract_aal_stats(&dev->stats.aal5, &tmp.aal5); } return error ? -EFAULT : 0; } int atm_getnames(void __user *buf, int __user *iobuf_len) { int error, len, size = 0; struct atm_dev *dev; struct list_head *p; int *tmp_buf, *tmp_p; if (get_user(len, iobuf_len)) return -EFAULT; mutex_lock(&atm_dev_mutex); list_for_each(p, &atm_devs) size += sizeof(int); if (size > len) { mutex_unlock(&atm_dev_mutex); return -E2BIG; } tmp_buf = kmalloc(size, GFP_ATOMIC); if (!tmp_buf) { mutex_unlock(&atm_dev_mutex); return -ENOMEM; } tmp_p = tmp_buf; list_for_each_entry(dev, &atm_devs, dev_list) { *tmp_p++ = dev->number; } mutex_unlock(&atm_dev_mutex); error = ((copy_to_user(buf, tmp_buf, size)) || put_user(size, iobuf_len)) ? -EFAULT : 0; kfree(tmp_buf); return error; } int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len, int number, int compat) { int error, len, size = 0; struct atm_dev *dev; if (get_user(len, sioc_len)) return -EFAULT; dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d", number); if (!dev) return -ENODEV; switch (cmd) { case ATM_GETTYPE: size = strlen(dev->type) + 1; if (copy_to_user(buf, dev->type, size)) { error = -EFAULT; goto done; } break; case ATM_GETESI: size = ESI_LEN; if (copy_to_user(buf, dev->esi, size)) { error = -EFAULT; goto done; } break; case ATM_SETESI: { int i; for (i = 0; i < ESI_LEN; i++) if (dev->esi[i]) { error = -EEXIST; goto done; } } fallthrough; case ATM_SETESIF: { unsigned char esi[ESI_LEN]; if (!capable(CAP_NET_ADMIN)) { error = -EPERM; goto done; } if (copy_from_user(esi, buf, ESI_LEN)) { error = -EFAULT; goto done; } memcpy(dev->esi, esi, ESI_LEN); error = ESI_LEN; goto done; } case ATM_GETSTATZ: if (!capable(CAP_NET_ADMIN)) { error = -EPERM; goto done; } fallthrough; case ATM_GETSTAT: size = sizeof(struct atm_dev_stats); error = fetch_stats(dev, buf, cmd == ATM_GETSTATZ); if (error) goto done; break; case ATM_GETCIRANGE: size = sizeof(struct atm_cirange); if (copy_to_user(buf, &dev->ci_range, size)) { error = -EFAULT; goto done; } break; case ATM_GETLINKRATE: size = sizeof(int); if (copy_to_user(buf, &dev->link_rate, size)) { error = -EFAULT; goto done; } break; case ATM_RSTADDR: if (!capable(CAP_NET_ADMIN)) { error = -EPERM; goto done; } atm_reset_addr(dev, ATM_ADDR_LOCAL); break; case ATM_ADDADDR: case ATM_DELADDR: case ATM_ADDLECSADDR: case ATM_DELLECSADDR: { struct sockaddr_atmsvc addr; if (!capable(CAP_NET_ADMIN)) { error = -EPERM; goto done; } if (copy_from_user(&addr, buf, sizeof(addr))) { error = -EFAULT; goto done; } if (cmd == ATM_ADDADDR || cmd == ATM_ADDLECSADDR) error = atm_add_addr(dev, &addr, (cmd == ATM_ADDADDR ? ATM_ADDR_LOCAL : ATM_ADDR_LECS)); else error = atm_del_addr(dev, &addr, (cmd == ATM_DELADDR ? ATM_ADDR_LOCAL : ATM_ADDR_LECS)); goto done; } case ATM_GETADDR: case ATM_GETLECSADDR: error = atm_get_addr(dev, buf, len, (cmd == ATM_GETADDR ? ATM_ADDR_LOCAL : ATM_ADDR_LECS)); if (error < 0) goto done; size = error; /* may return 0, but later on size == 0 means "don't write the length" */ error = put_user(size, sioc_len) ? -EFAULT : 0; goto done; case ATM_SETLOOP: if (__ATM_LM_XTRMT((int) (unsigned long) buf) && __ATM_LM_XTLOC((int) (unsigned long) buf) > __ATM_LM_XTRMT((int) (unsigned long) buf)) { error = -EINVAL; goto done; } fallthrough; case ATM_SETCIRANGE: case SONET_GETSTATZ: case SONET_SETDIAG: case SONET_CLRDIAG: case SONET_SETFRAMING: if (!capable(CAP_NET_ADMIN)) { error = -EPERM; goto done; } fallthrough; default: if (IS_ENABLED(CONFIG_COMPAT) && compat) { #ifdef CONFIG_COMPAT if (!dev->ops->compat_ioctl) { error = -EINVAL; goto done; } size = dev->ops->compat_ioctl(dev, cmd, buf); #endif } else { if (!dev->ops->ioctl) { error = -EINVAL; goto done; } size = dev->ops->ioctl(dev, cmd, buf); } if (size < 0) { error = (size == -ENOIOCTLCMD ? -ENOTTY : size); goto done; } } if (size) error = put_user(size, sioc_len) ? -EFAULT : 0; else error = 0; done: atm_dev_put(dev); return error; } #ifdef CONFIG_PROC_FS void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos) { mutex_lock(&atm_dev_mutex); return seq_list_start_head(&atm_devs, *pos); } void atm_dev_seq_stop(struct seq_file *seq, void *v) { mutex_unlock(&atm_dev_mutex); } void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &atm_devs, pos); } #endif
8 2 1 7 6 3 2 1 2 5 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/filter.h> #include <linux/slab.h> #include <linux/numa.h> #include <linux/seq_file.h> #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/btf_ids.h> #include <linux/rcupdate_wait.h> #include <linux/poll.h> struct bpf_struct_ops_value { struct bpf_struct_ops_common_value common; char data[] ____cacheline_aligned_in_smp; }; #define MAX_TRAMP_IMAGE_PAGES 8 struct bpf_struct_ops_map { struct bpf_map map; const struct bpf_struct_ops_desc *st_ops_desc; /* protect map_update */ struct mutex lock; /* link has all the bpf_links that is populated * to the func ptr of the kernel's struct * (in kvalue.data). */ struct bpf_link **links; /* ksyms for bpf trampolines */ struct bpf_ksym **ksyms; u32 funcs_cnt; u32 image_pages_cnt; /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog. */ void *image_pages[MAX_TRAMP_IMAGE_PAGES]; /* The owner moduler's btf. */ struct btf *btf; /* uvalue->data stores the kernel struct * (e.g. tcp_congestion_ops) that is more useful * to userspace than the kvalue. For example, * the bpf_prog's id is stored instead of the kernel * address of a func ptr. */ struct bpf_struct_ops_value *uvalue; /* kvalue.data stores the actual kernel's struct * (e.g. tcp_congestion_ops) that will be * registered to the kernel subsystem. */ struct bpf_struct_ops_value kvalue; }; struct bpf_struct_ops_link { struct bpf_link link; struct bpf_map __rcu *map; wait_queue_head_t wait_hup; }; static DEFINE_MUTEX(update_mutex); #define VALUE_PREFIX "bpf_struct_ops_" #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { }; const struct bpf_prog_ops bpf_struct_ops_prog_ops = { #ifdef CONFIG_NET .test_run = bpf_struct_ops_test_run, #endif }; BTF_ID_LIST(st_ops_ids) BTF_ID(struct, module) BTF_ID(struct, bpf_struct_ops_common_value) enum { IDX_MODULE_ID, IDX_ST_OPS_COMMON_VALUE_ID, }; extern struct btf *btf_vmlinux; static bool is_valid_value_type(struct btf *btf, s32 value_id, const struct btf_type *type, const char *value_name) { const struct btf_type *common_value_type; const struct btf_member *member; const struct btf_type *vt, *mt; vt = btf_type_by_id(btf, value_id); if (btf_vlen(vt) != 2) { pr_warn("The number of %s's members should be 2, but we get %d\n", value_name, btf_vlen(vt)); return false; } member = btf_type_member(vt); mt = btf_type_by_id(btf, member->type); common_value_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]); if (mt != common_value_type) { pr_warn("The first member of %s should be bpf_struct_ops_common_value\n", value_name); return false; } member++; mt = btf_type_by_id(btf, member->type); if (mt != type) { pr_warn("The second member of %s should be %s\n", value_name, btf_name_by_offset(btf, type->name_off)); return false; } return true; } static void *bpf_struct_ops_image_alloc(void) { void *image; int err; err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) return ERR_PTR(err); image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!image) { bpf_jit_uncharge_modmem(PAGE_SIZE); return ERR_PTR(-ENOMEM); } return image; } void bpf_struct_ops_image_free(void *image) { if (image) { arch_free_bpf_trampoline(image, PAGE_SIZE); bpf_jit_uncharge_modmem(PAGE_SIZE); } } #define MAYBE_NULL_SUFFIX "__nullable" #define REFCOUNTED_SUFFIX "__ref" /* Prepare argument info for every nullable argument of a member of a * struct_ops type. * * Initialize a struct bpf_struct_ops_arg_info according to type info of * the arguments of a stub function. (Check kCFI for more information about * stub functions.) * * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info * to provide an array of struct bpf_ctx_arg_aux, which in turn provides * the information that used by the verifier to check the arguments of the * BPF struct_ops program assigned to the member. Here, we only care about * the arguments that are marked as __nullable. * * The array of struct bpf_ctx_arg_aux is eventually assigned to * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the * verifier. (See check_struct_ops_btf_id()) * * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If * fails, it will be kept untouched. */ static int prepare_arg_info(struct btf *btf, const char *st_ops_name, const char *member_name, const struct btf_type *func_proto, void *stub_func_addr, struct bpf_struct_ops_arg_info *arg_info) { const struct btf_type *stub_func_proto, *pointed_type; bool is_nullable = false, is_refcounted = false; const struct btf_param *stub_args, *args; struct bpf_ctx_arg_aux *info, *info_buf; u32 nargs, arg_no, info_cnt = 0; char ksym[KSYM_SYMBOL_LEN]; const char *stub_fname; const char *suffix; s32 stub_func_id; u32 arg_btf_id; int offset; stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym); if (!stub_fname) { pr_warn("Cannot find the stub function name for the %s in struct %s\n", member_name, st_ops_name); return -ENOENT; } stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC); if (stub_func_id < 0) { pr_warn("Cannot find the stub function %s in btf\n", stub_fname); return -ENOENT; } stub_func_proto = btf_type_by_id(btf, stub_func_id); stub_func_proto = btf_type_by_id(btf, stub_func_proto->type); /* Check if the number of arguments of the stub function is the same * as the number of arguments of the function pointer. */ nargs = btf_type_vlen(func_proto); if (nargs != btf_type_vlen(stub_func_proto)) { pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n", stub_fname, member_name, st_ops_name); return -EINVAL; } if (!nargs) return 0; args = btf_params(func_proto); stub_args = btf_params(stub_func_proto); info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL); if (!info_buf) return -ENOMEM; /* Prepare info for every nullable argument */ info = info_buf; for (arg_no = 0; arg_no < nargs; arg_no++) { /* Skip arguments that is not suffixed with * "__nullable or __ref". */ is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no], MAYBE_NULL_SUFFIX); is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no], REFCOUNTED_SUFFIX); if (is_nullable) suffix = MAYBE_NULL_SUFFIX; else if (is_refcounted) suffix = REFCOUNTED_SUFFIX; else continue; /* Should be a pointer to struct */ pointed_type = btf_type_resolve_ptr(btf, args[arg_no].type, &arg_btf_id); if (!pointed_type || !btf_type_is_struct(pointed_type)) { pr_warn("stub function %s has %s tagging to an unsupported type\n", stub_fname, suffix); goto err_out; } offset = btf_ctx_arg_offset(btf, func_proto, arg_no); if (offset < 0) { pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n", stub_fname, arg_no); goto err_out; } if (args[arg_no].type != stub_args[arg_no].type) { pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n", arg_no, stub_fname); goto err_out; } /* Fill the information of the new argument */ info->btf_id = arg_btf_id; info->btf = btf; info->offset = offset; if (is_nullable) { info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; } else if (is_refcounted) { info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID; info->refcounted = true; } info++; info_cnt++; } if (info_cnt) { arg_info->info = info_buf; arg_info->cnt = info_cnt; } else { kfree(info_buf); } return 0; err_out: kfree(info_buf); return -EINVAL; } /* Clean up the arg_info in a struct bpf_struct_ops_desc. */ void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) { struct bpf_struct_ops_arg_info *arg_info; int i; arg_info = st_ops_desc->arg_info; for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++) kfree(arg_info[i].info); kfree(arg_info); } static bool is_module_member(const struct btf *btf, u32 id) { const struct btf_type *t; t = btf_type_resolve_ptr(btf, id, NULL); if (!t) return false; if (!__btf_type_is_struct(t) && !btf_type_is_fwd(t)) return false; return !strcmp(btf_name_by_offset(btf, t->name_off), "module"); } int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) { void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); return func_ptr ? 0 : -ENOTSUPP; } int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) { struct bpf_struct_ops *st_ops = st_ops_desc->st_ops; struct bpf_struct_ops_arg_info *arg_info; const struct btf_member *member; const struct btf_type *t; s32 type_id, value_id; char value_name[128]; const char *mname; int i, err; if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= sizeof(value_name)) { pr_warn("struct_ops name %s is too long\n", st_ops->name); return -EINVAL; } sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); if (!st_ops->cfi_stubs) { pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name); return -EINVAL; } type_id = btf_find_by_name_kind(btf, st_ops->name, BTF_KIND_STRUCT); if (type_id < 0) { pr_warn("Cannot find struct %s in %s\n", st_ops->name, btf_get_name(btf)); return -EINVAL; } t = btf_type_by_id(btf, type_id); if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { pr_warn("Cannot support #%u members in struct %s\n", btf_type_vlen(t), st_ops->name); return -EINVAL; } value_id = btf_find_by_name_kind(btf, value_name, BTF_KIND_STRUCT); if (value_id < 0) { pr_warn("Cannot find struct %s in %s\n", value_name, btf_get_name(btf)); return -EINVAL; } if (!is_valid_value_type(btf, value_id, t, value_name)) return -EINVAL; arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info), GFP_KERNEL); if (!arg_info) return -ENOMEM; st_ops_desc->arg_info = arg_info; st_ops_desc->type = t; st_ops_desc->type_id = type_id; st_ops_desc->value_id = value_id; st_ops_desc->value_type = btf_type_by_id(btf, value_id); for_each_member(i, t, member) { const struct btf_type *func_proto, *ret_type; void **stub_func_addr; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(btf, member->name_off); if (!*mname) { pr_warn("anon member in struct %s is not supported\n", st_ops->name); err = -EOPNOTSUPP; goto errout; } if (__btf_member_bitfield_size(t, member)) { pr_warn("bit field member %s in struct %s is not supported\n", mname, st_ops->name); err = -EOPNOTSUPP; goto errout; } if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) { pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n", st_ops->name); err = -EOPNOTSUPP; goto errout; } func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); /* The member is not a function pointer or * the function pointer is not supported. */ if (!func_proto || bpf_struct_ops_supported(st_ops, moff)) continue; if (func_proto->type) { ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL); if (ret_type && !__btf_type_is_struct(ret_type)) { pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n", mname, st_ops->name); err = -EOPNOTSUPP; goto errout; } } if (btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[i])) { pr_warn("Error in parsing func ptr %s in struct %s\n", mname, st_ops->name); err = -EINVAL; goto errout; } stub_func_addr = *(void **)(st_ops->cfi_stubs + moff); err = prepare_arg_info(btf, st_ops->name, mname, func_proto, stub_func_addr, arg_info + i); if (err) goto errout; } if (st_ops->init(btf)) { pr_warn("Error in init bpf_struct_ops %s\n", st_ops->name); err = -EINVAL; goto errout; } return 0; errout: bpf_struct_ops_desc_release(st_ops_desc); return err; } static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { if (key && *(u32 *)key == 0) return -ENOENT; *(u32 *)next_key = 0; return 0; } int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; struct bpf_struct_ops_value *uvalue, *kvalue; enum bpf_struct_ops_state state; s64 refcnt; if (unlikely(*(u32 *)key != 0)) return -ENOENT; kvalue = &st_map->kvalue; /* Pair with smp_store_release() during map_update */ state = smp_load_acquire(&kvalue->common.state); if (state == BPF_STRUCT_OPS_STATE_INIT) { memset(value, 0, map->value_size); return 0; } /* No lock is needed. state and refcnt do not need * to be updated together under atomic context. */ uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); uvalue->common.state = state; /* This value offers the user space a general estimate of how * many sockets are still utilizing this struct_ops for TCP * congestion control. The number might not be exact, but it * should sufficiently meet our present goals. */ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt); refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0)); return 0; } static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EINVAL); } static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) { u32 i; for (i = 0; i < st_map->funcs_cnt; i++) { if (!st_map->links[i]) break; bpf_link_put(st_map->links[i]); st_map->links[i] = NULL; } } static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) { int i; for (i = 0; i < st_map->image_pages_cnt; i++) bpf_struct_ops_image_free(st_map->image_pages[i]); st_map->image_pages_cnt = 0; } static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data) { const struct btf_member *member; u32 i, moff, msize, prev_mend = 0; const struct btf_type *mtype; for_each_member(i, t, member) { moff = __btf_member_bit_offset(t, member) / 8; if (moff > prev_mend && memchr_inv(data + prev_mend, 0, moff - prev_mend)) return -EINVAL; mtype = btf_type_by_id(btf, member->type); mtype = btf_resolve_size(btf, mtype, &msize); if (IS_ERR(mtype)) return PTR_ERR(mtype); prev_mend = moff + msize; } if (t->size > prev_mend && memchr_inv(data + prev_mend, 0, t->size - prev_mend)) return -EINVAL; return 0; } static void bpf_struct_ops_link_release(struct bpf_link *link) { } static void bpf_struct_ops_link_dealloc(struct bpf_link *link) { struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link); kfree(tlink); } const struct bpf_link_ops bpf_struct_ops_link_lops = { .release = bpf_struct_ops_link_release, .dealloc = bpf_struct_ops_link_dealloc, }; int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *stub_func, void **_image, u32 *_image_off, bool allow_alloc) { u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT; void *image = *_image; int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); if (size <= 0) return size ? : -EFAULT; /* Allocate image buffer if necessary */ if (!image || size > PAGE_SIZE - image_off) { if (!allow_alloc) return -E2BIG; image = bpf_struct_ops_image_alloc(); if (IS_ERR(image)) return PTR_ERR(image); image_off = 0; } size = arch_prepare_bpf_trampoline(NULL, image + image_off, image + image_off + size, model, flags, tlinks, stub_func); if (size <= 0) { if (image != *_image) bpf_struct_ops_image_free(image); return size ? : -EFAULT; } *_image = image; *_image_off = image_off + size; return 0; } static void bpf_struct_ops_ksym_init(const char *tname, const char *mname, void *image, unsigned int size, struct bpf_ksym *ksym) { snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname); INIT_LIST_HEAD_RCU(&ksym->lnode); bpf_image_ksym_init(image, size, ksym); } static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map) { u32 i; for (i = 0; i < st_map->funcs_cnt; i++) { if (!st_map->ksyms[i]) break; bpf_image_ksym_add(st_map->ksyms[i]); } } static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map) { u32 i; for (i = 0; i < st_map->funcs_cnt; i++) { if (!st_map->ksyms[i]) break; bpf_image_ksym_del(st_map->ksyms[i]); } } static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map) { u32 i; for (i = 0; i < st_map->funcs_cnt; i++) { if (!st_map->ksyms[i]) break; kfree(st_map->ksyms[i]); st_map->ksyms[i] = NULL; } } static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc; const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops; struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_type *module_type; const struct btf_member *member; const struct btf_type *t = st_ops_desc->type; struct bpf_tramp_links *tlinks; void *udata, *kdata; int prog_fd, err; u32 i, trampoline_start, image_off = 0; void *cur_image = NULL, *image = NULL; struct bpf_link **plink; struct bpf_ksym **pksym; const char *tname, *mname; if (flags) return -EINVAL; if (*(u32 *)key != 0) return -E2BIG; err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value); if (err) return err; uvalue = value; err = check_zero_holes(st_map->btf, t, uvalue->data); if (err) return err; if (uvalue->common.state || refcount_read(&uvalue->common.refcnt)) return -EINVAL; tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); if (!tlinks) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; mutex_lock(&st_map->lock); if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) { err = -EBUSY; goto unlock; } memcpy(uvalue, value, map->value_size); udata = &uvalue->data; kdata = &kvalue->data; plink = st_map->links; pksym = st_map->ksyms; tname = btf_name_by_offset(st_map->btf, t->name_off); module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; struct bpf_tramp_link *link; struct bpf_ksym *ksym; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(st_map->btf, member->name_off); ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) goto reset_unlock; *(void **)(kdata + moff) = BPF_MODULE_OWNER; continue; } err = st_ops->init_member(t, member, kdata, udata); if (err < 0) goto reset_unlock; /* The ->init_member() has handled this member */ if (err > 0) continue; /* If st_ops->init_member does not handle it, * we will only handle func ptrs and zero-ed members * here. Reject everything else. */ /* All non func ptr member must be 0 */ if (!ptype || !btf_type_is_func_proto(ptype)) { u32 msize; mtype = btf_type_by_id(st_map->btf, member->type); mtype = btf_resolve_size(st_map->btf, mtype, &msize); if (IS_ERR(mtype)) { err = PTR_ERR(mtype); goto reset_unlock; } if (memchr_inv(udata + moff, 0, msize)) { err = -EINVAL; goto reset_unlock; } continue; } prog_fd = (int)(*(unsigned long *)(udata + moff)); /* Similar check as the attr->attach_prog_fd */ if (!prog_fd) continue; prog = bpf_prog_get(prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto reset_unlock; } if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || prog->aux->attach_btf_id != st_ops_desc->type_id || prog->expected_attach_type != i) { bpf_prog_put(prog); err = -EINVAL; goto reset_unlock; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { bpf_prog_put(prog); err = -ENOMEM; goto reset_unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); *plink++ = &link->link; ksym = kzalloc(sizeof(*ksym), GFP_USER); if (!ksym) { err = -ENOMEM; goto reset_unlock; } *pksym++ = ksym; trampoline_start = image_off; err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], *(void **)(st_ops->cfi_stubs + moff), &image, &image_off, st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES); if (err) goto reset_unlock; if (cur_image != image) { st_map->image_pages[st_map->image_pages_cnt++] = image; cur_image = image; trampoline_start = 0; } *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset(); /* put prog_id to udata */ *(unsigned long *)(udata + moff) = prog->aux->id; /* init ksym for this trampoline */ bpf_struct_ops_ksym_init(tname, mname, image + trampoline_start, image_off - trampoline_start, ksym); } if (st_ops->validate) { err = st_ops->validate(kdata); if (err) goto reset_unlock; } for (i = 0; i < st_map->image_pages_cnt; i++) { err = arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); if (err) goto reset_unlock; } if (st_map->map.map_flags & BPF_F_LINK) { err = 0; /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). */ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY); goto unlock; } err = st_ops->reg(kdata, NULL); if (likely(!err)) { /* This refcnt increment on the map here after * 'st_ops->reg()' is secure since the state of the * map must be set to INIT at this moment, and thus * bpf_struct_ops_map_delete_elem() can't unregister * or transition it to TOBEFREE concurrently. */ bpf_map_inc(map); /* Pair with smp_load_acquire() during lookup_elem(). * It ensures the above udata updates (e.g. prog->aux->id) * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. */ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE); goto unlock; } /* Error during st_ops->reg(). Can happen if this struct_ops needs to be * verified as a whole, after all init_member() calls. Can also happen if * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ reset_unlock: bpf_struct_ops_map_free_ksyms(st_map); bpf_struct_ops_map_free_image(st_map); bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: kfree(tlinks); mutex_unlock(&st_map->lock); if (!err) bpf_struct_ops_map_add_ksyms(st_map); return err; } static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) { enum bpf_struct_ops_state prev_state; struct bpf_struct_ops_map *st_map; st_map = (struct bpf_struct_ops_map *)map; if (st_map->map.map_flags & BPF_F_LINK) return -EOPNOTSUPP; prev_state = cmpxchg(&st_map->kvalue.common.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); switch (prev_state) { case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL); bpf_map_put(map); return 0; case BPF_STRUCT_OPS_STATE_TOBEFREE: return -EINPROGRESS; case BPF_STRUCT_OPS_STATE_INIT: return -ENOENT; default: WARN_ON_ONCE(1); /* Should never happen. Treat it as not found. */ return -ENOENT; } } static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; void *value; int err; value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) return; err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); if (!err) { btf_type_seq_show(st_map->btf, map->btf_vmlinux_value_type_id, value, m); seq_putc(m, '\n'); } kfree(value); } static void __bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; if (st_map->links) bpf_struct_ops_map_put_progs(st_map); if (st_map->ksyms) bpf_struct_ops_map_free_ksyms(st_map); bpf_map_area_free(st_map->links); bpf_map_area_free(st_map->ksyms); bpf_struct_ops_map_free_image(st_map); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); } static void bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; /* st_ops->owner was acquired during map_alloc to implicitly holds * the btf's refcnt. The acquire was only done when btf_is_module() * st_map->btf cannot be NULL here. */ if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); bpf_struct_ops_map_del_ksyms(st_map); /* The struct_ops's function may switch to another struct_ops. * * For example, bpf_tcp_cc_x->init() may switch to * another tcp_cc_y by calling * setsockopt(TCP_CONGESTION, "tcp_cc_y"). * During the switch, bpf_struct_ops_put(tcp_cc_x) is called * and its refcount may reach 0 which then free its * trampoline image while tcp_cc_x is still running. * * A vanilla rcu gp is to wait for all bpf-tcp-cc prog * to finish. bpf-tcp-cc prog is non sleepable. * A rcu_tasks gp is to wait for the last few insn * in the tramopline image to finish before releasing * the trampoline image. */ synchronize_rcu_mult(call_rcu, call_rcu_tasks); __bpf_struct_ops_map_free(map); } static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) { if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) || !attr->btf_vmlinux_value_type_id) return -EINVAL; return 0; } static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t) { int i; u32 count; const struct btf_member *member; count = 0; for_each_member(i, t, member) if (btf_type_resolve_func_ptr(btf, member->type, NULL)) count++; return count; } static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { const struct bpf_struct_ops_desc *st_ops_desc; size_t st_map_size; struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; struct module *mod = NULL; struct bpf_map *map; struct btf *btf; int ret; if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) { /* The map holds btf for its whole life time. */ btf = btf_get_by_fd(attr->value_type_btf_obj_fd); if (IS_ERR(btf)) return ERR_CAST(btf); if (!btf_is_module(btf)) { btf_put(btf); return ERR_PTR(-EINVAL); } mod = btf_try_get_module(btf); /* mod holds a refcnt to btf. We don't need an extra refcnt * here. */ btf_put(btf); if (!mod) return ERR_PTR(-EINVAL); } else { btf = bpf_get_btf_vmlinux(); if (IS_ERR(btf)) return ERR_CAST(btf); if (!btf) return ERR_PTR(-ENOTSUPP); } st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id); if (!st_ops_desc) { ret = -ENOTSUPP; goto errout; } vt = st_ops_desc->value_type; if (attr->value_size != vt->size) { ret = -EINVAL; goto errout; } t = st_ops_desc->type; st_map_size = sizeof(*st_map) + /* kvalue stores the * struct bpf_struct_ops_tcp_congestions_ops */ (vt->size - sizeof(struct bpf_struct_ops_value)); st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); if (!st_map) { ret = -ENOMEM; goto errout; } st_map->st_ops_desc = st_ops_desc; map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); st_map->funcs_cnt = count_func_ptrs(btf, t); st_map->links = bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *), NUMA_NO_NODE); st_map->ksyms = bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *), NUMA_NO_NODE); if (!st_map->uvalue || !st_map->links || !st_map->ksyms) { ret = -ENOMEM; goto errout_free; } st_map->btf = btf; mutex_init(&st_map->lock); bpf_map_init_from_attr(map, attr); return map; errout_free: __bpf_struct_ops_map_free(map); errout: module_put(mod); return ERR_PTR(ret); } static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc; const struct btf_type *vt = st_ops_desc->value_type; u64 usage; usage = sizeof(*st_map) + vt->size - sizeof(struct bpf_struct_ops_value); usage += vt->size; usage += st_map->funcs_cnt * sizeof(struct bpf_link *); usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *); usage += PAGE_SIZE; return usage; } BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, .map_free = bpf_struct_ops_map_free, .map_get_next_key = bpf_struct_ops_map_get_next_key, .map_lookup_elem = bpf_struct_ops_map_lookup_elem, .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, .map_mem_usage = bpf_struct_ops_map_mem_usage, .map_btf_id = &bpf_struct_ops_map_btf_ids[0], }; /* "const void *" because some subsystem is * passing a const (e.g. const struct tcp_congestion_ops *) */ bool bpf_struct_ops_get(const void *kdata) { struct bpf_struct_ops_value *kvalue; struct bpf_struct_ops_map *st_map; struct bpf_map *map; kvalue = container_of(kdata, struct bpf_struct_ops_value, data); st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); map = __bpf_map_inc_not_zero(&st_map->map, false); return !IS_ERR(map); } void bpf_struct_ops_put(const void *kdata) { struct bpf_struct_ops_value *kvalue; struct bpf_struct_ops_map *st_map; kvalue = container_of(kdata, struct bpf_struct_ops_value, data); st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); bpf_map_put(&st_map->map); } static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; return map->map_type == BPF_MAP_TYPE_STRUCT_OPS && map->map_flags & BPF_F_LINK && /* Pair with smp_store_release() during map_update */ smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY; } static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) { struct bpf_struct_ops_link *st_link; struct bpf_struct_ops_map *st_map; st_link = container_of(link, struct bpf_struct_ops_link, link); st_map = (struct bpf_struct_ops_map *) rcu_dereference_protected(st_link->map, true); if (st_map) { st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); bpf_map_put(&st_map->map); } kfree(st_link); } static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_struct_ops_link *st_link; struct bpf_map *map; st_link = container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); map = rcu_dereference(st_link->map); if (map) seq_printf(seq, "map_id:\t%d\n", map->id); rcu_read_unlock(); } static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_struct_ops_link *st_link; struct bpf_map *map; st_link = container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); map = rcu_dereference(st_link->map); if (map) info->struct_ops.map_id = map->id; rcu_read_unlock(); return 0; } static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map, struct bpf_map *expected_old_map) { struct bpf_struct_ops_map *st_map, *old_st_map; struct bpf_map *old_map; struct bpf_struct_ops_link *st_link; int err; st_link = container_of(link, struct bpf_struct_ops_link, link); st_map = container_of(new_map, struct bpf_struct_ops_map, map); if (!bpf_struct_ops_valid_to_reg(new_map)) return -EINVAL; if (!st_map->st_ops_desc->st_ops->update) return -EOPNOTSUPP; mutex_lock(&update_mutex); old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); if (!old_map) { err = -ENOLINK; goto err_out; } if (expected_old_map && old_map != expected_old_map) { err = -EPERM; goto err_out; } old_st_map = container_of(old_map, struct bpf_struct_ops_map, map); /* The new and old struct_ops must be the same type. */ if (st_map->st_ops_desc != old_st_map->st_ops_desc) { err = -EINVAL; goto err_out; } err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link); if (err) goto err_out; bpf_map_inc(new_map); rcu_assign_pointer(st_link->map, new_map); bpf_map_put(old_map); err_out: mutex_unlock(&update_mutex); return err; } static int bpf_struct_ops_map_link_detach(struct bpf_link *link) { struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); struct bpf_struct_ops_map *st_map; struct bpf_map *map; mutex_lock(&update_mutex); map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); if (!map) { mutex_unlock(&update_mutex); return 0; } st_map = container_of(map, struct bpf_struct_ops_map, map); st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); RCU_INIT_POINTER(st_link->map, NULL); /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or * bpf_map_inc() in bpf_struct_ops_map_link_update(). */ bpf_map_put(&st_map->map); mutex_unlock(&update_mutex); wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP); return 0; } static __poll_t bpf_struct_ops_map_link_poll(struct file *file, struct poll_table_struct *pts) { struct bpf_struct_ops_link *st_link = file->private_data; poll_wait(file, &st_link->wait_hup, pts); return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP; } static const struct bpf_link_ops bpf_struct_ops_map_lops = { .dealloc = bpf_struct_ops_map_link_dealloc, .detach = bpf_struct_ops_map_link_detach, .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, .fill_link_info = bpf_struct_ops_map_link_fill_link_info, .update_map = bpf_struct_ops_map_link_update, .poll = bpf_struct_ops_map_link_poll, }; int bpf_struct_ops_link_create(union bpf_attr *attr) { struct bpf_struct_ops_link *link = NULL; struct bpf_link_primer link_primer; struct bpf_struct_ops_map *st_map; struct bpf_map *map; int err; map = bpf_map_get(attr->link_create.map_fd); if (IS_ERR(map)) return PTR_ERR(map); st_map = (struct bpf_struct_ops_map *)map; if (!bpf_struct_ops_valid_to_reg(map)) { err = -EINVAL; goto err_out; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto err_out; } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); err = bpf_link_prime(&link->link, &link_primer); if (err) goto err_out; init_waitqueue_head(&link->wait_hup); /* Hold the update_mutex such that the subsystem cannot * do link->ops->detach() before the link is fully initialized. */ mutex_lock(&update_mutex); err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); if (err) { mutex_unlock(&update_mutex); bpf_link_cleanup(&link_primer); link = NULL; goto err_out; } RCU_INIT_POINTER(link->map, map); mutex_unlock(&update_mutex); return bpf_link_settle(&link_primer); err_out: bpf_map_put(map); kfree(link); return err; } void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; info->btf_vmlinux_id = btf_obj_id(st_map->btf); }
1608 1608 1608 1609 138 139 139 139 114 138 139 1534 1536 64 64 64 555 36 36 114 114 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/file.c - sysfs regular (text) file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/module.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/mm.h> #include "sysfs.h" static struct kobject *sysfs_file_kobj(struct kernfs_node *kn) { guard(rcu)(); return rcu_dereference(kn->__parent)->priv; } /* * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. */ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { struct kobject *kobj = sysfs_file_kobj(kn); if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } /* * Reads on sysfs are handled through seq_file, which takes care of hairy * details like buffering and seeking. The following function pipes * sysfs_ops->show() result through seq_file. */ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; struct kobject *kobj = sysfs_file_kobj(of->kn); const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; char *buf; if (WARN_ON_ONCE(!ops->show)) return -EINVAL; /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } memset(buf, 0, PAGE_SIZE); count = ops->show(kobj, of->kn->priv, buf); if (count < 0) return count; /* * The code works fine with PAGE_SIZE return but it's likely to * indicate truncated result or overflow in normal use cases. */ if (count >= (ssize_t)PAGE_SIZE) { printk("fill_read_buffer: %pS returned bad count\n", ops->show); /* Try to struggle along */ count = PAGE_SIZE - 1; } seq_commit(sf, count); return 0; } static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; if (!count) return 0; if (size) { if (pos >= size) return 0; if (pos + count > size) count = size - pos; } if (!battr->read && !battr->read_new) return -EIO; if (battr->read_new) return battr->read_new(of->file, kobj, battr, buf, pos, count); return battr->read(of->file, kobj, battr, buf, pos, count); } /* kernfs read callback for regular sysfs files with pre-alloc */ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = sysfs_file_kobj(of->kn); ssize_t len; /* * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); if (len < 0) return len; if (pos) { if (len <= pos) return 0; len -= pos; memmove(buf, buf + pos, len); } return min_t(ssize_t, count, len); } /* kernfs write callback for regular sysfs files */ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = sysfs_file_kobj(of->kn); if (!count) return 0; return ops->store(kobj, of->kn->priv, buf, count); } /* kernfs write callback for bin sysfs files */ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; if (size) { if (size <= pos) return -EFBIG; count = min_t(ssize_t, count, size - pos); } if (!count) return 0; if (!battr->write && !battr->write_new) return -EIO; if (battr->write_new) return battr->write_new(of->file, kobj, battr, buf, pos, count); return battr->write(of->file, kobj, battr, buf, pos, count); } static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, struct vm_area_struct *vma) { struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); return battr->mmap(of->file, kobj, battr, vma); } static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset, int whence) { struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); if (battr->llseek) return battr->llseek(of->file, kobj, battr, offset, whence); else return generic_file_llseek(of->file, offset, whence); } static int sysfs_kf_bin_open(struct kernfs_open_file *of) { struct bin_attribute *battr = of->kn->priv; if (battr->f_mapping) of->file->f_mapping = battr->f_mapping(); return 0; } void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { struct kernfs_node *kn = kobj->sd, *tmp; if (kn && dir) kn = kernfs_find_and_get(kn, dir); else kernfs_get(kn); if (kn && attr) { tmp = kernfs_find_and_get(kn, attr); kernfs_put(kn); kn = tmp; } if (kn) { kernfs_notify(kn); kernfs_put(kn); } } EXPORT_SYMBOL_GPL(sysfs_notify); static const struct kernfs_ops sysfs_file_kfops_empty = { }; static const struct kernfs_ops sysfs_file_kfops_ro = { .seq_show = sysfs_kf_seq_show, }; static const struct kernfs_ops sysfs_file_kfops_wo = { .write = sysfs_kf_write, }; static const struct kernfs_ops sysfs_file_kfops_rw = { .seq_show = sysfs_kf_seq_show, .write = sysfs_kf_write, }; static const struct kernfs_ops sysfs_prealloc_kfops_ro = { .read = sysfs_kf_read, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_wo = { .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_rw = { .read = sysfs_kf_read, .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_bin_kfops_ro = { .read = sysfs_kf_bin_read, }; static const struct kernfs_ops sysfs_bin_kfops_wo = { .write = sysfs_kf_bin_write, }; static const struct kernfs_ops sysfs_bin_kfops_rw = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, }; static const struct kernfs_ops sysfs_bin_kfops_mmap = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, .mmap = sysfs_kf_bin_mmap, .open = sysfs_kf_bin_open, .llseek = sysfs_kf_bin_llseek, }; int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, umode_t mode, kuid_t uid, kgid_t gid, const void *ns) { struct kobject *kobj = parent->priv; const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; struct lock_class_key *key = NULL; const struct kernfs_ops *ops = NULL; struct kernfs_node *kn; /* every kobject with an attribute needs a ktype assigned */ if (WARN(!sysfs_ops, KERN_ERR "missing sysfs attribute operations for kobject: %s\n", kobject_name(kobj))) return -EINVAL; if (mode & SYSFS_PREALLOC) { if (sysfs_ops->show && sysfs_ops->store) ops = &sysfs_prealloc_kfops_rw; else if (sysfs_ops->show) ops = &sysfs_prealloc_kfops_ro; else if (sysfs_ops->store) ops = &sysfs_prealloc_kfops_wo; } else { if (sysfs_ops->show && sysfs_ops->store) ops = &sysfs_file_kfops_rw; else if (sysfs_ops->show) ops = &sysfs_file_kfops_ro; else if (sysfs_ops->store) ops = &sysfs_file_kfops_wo; } if (!ops) ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, PAGE_SIZE, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0; } int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct bin_attribute *battr, umode_t mode, size_t size, kuid_t uid, kgid_t gid, const void *ns) { const struct attribute *attr = &battr->attr; struct lock_class_key *key = NULL; const struct kernfs_ops *ops; struct kernfs_node *kn; if (battr->read && battr->read_new) return -EINVAL; if (battr->write && battr->write_new) return -EINVAL; if (battr->mmap) ops = &sysfs_bin_kfops_mmap; else if ((battr->read || battr->read_new) && (battr->write || battr->write_new)) ops = &sysfs_bin_kfops_rw; else if (battr->read || battr->read_new) ops = &sysfs_bin_kfops_ro; else if (battr->write || battr->write_new) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0; } /** * sysfs_create_file_ns - create an attribute file for an object with custom ns * @kobj: object we're creating for * @attr: attribute descriptor * @ns: namespace the new file should belong to */ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { kuid_t uid; kgid_t gid; if (WARN_ON(!kobj || !kobj->sd || !attr)) return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode, uid, gid, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr) { int err = 0; int i; for (i = 0; ptr[i] && !err; i++) err = sysfs_create_file(kobj, ptr[i]); if (err) while (--i >= 0) sysfs_remove_file(kobj, ptr[i]); return err; } EXPORT_SYMBOL_GPL(sysfs_create_files); /** * sysfs_add_file_to_group - add an attribute file to a pre-existing group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error; if (group) { parent = kernfs_find_and_get(kobj->sd, group); } else { parent = kobj->sd; kernfs_get(parent); } if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); error = sysfs_add_file_mode_ns(parent, attr, attr->mode, uid, gid, NULL); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); /** * sysfs_chmod_file - update the modified mode value on an object attribute. * @kobj: object we're acting for. * @attr: attribute descriptor. * @mode: file permissions. * */ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { struct kernfs_node *kn; struct iattr newattrs; int rc; kn = kernfs_find_and_get(kobj->sd, attr->name); if (!kn) return -ENOENT; newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; rc = kernfs_setattr(kn, &newattrs); kernfs_put(kn); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); /** * sysfs_break_active_protection - break "active" protection * @kobj: The kernel object @attr is associated with. * @attr: The attribute to break the "active" protection for. * * With sysfs, just like kernfs, deletion of an attribute is postponed until * all active .show() and .store() callbacks have finished unless this function * is called. Hence this function is useful in methods that implement self * deletion. */ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { struct kernfs_node *kn; kobject_get(kobj); kn = kernfs_find_and_get(kobj->sd, attr->name); if (kn) kernfs_break_active_protection(kn); else kobject_put(kobj); return kn; } EXPORT_SYMBOL_GPL(sysfs_break_active_protection); /** * sysfs_unbreak_active_protection - restore "active" protection * @kn: Pointer returned by sysfs_break_active_protection(). * * Undo the effects of sysfs_break_active_protection(). Since this function * calls kernfs_put() on the kernfs node that corresponds to the 'attr' * argument passed to sysfs_break_active_protection() that attribute may have * been removed between the sysfs_break_active_protection() and * sysfs_unbreak_active_protection() calls, it is not safe to access @kn after * this function has returned. */ void sysfs_unbreak_active_protection(struct kernfs_node *kn) { struct kobject *kobj = sysfs_file_kobj(kn); kernfs_unbreak_active_protection(kn); kernfs_put(kn); kobject_put(kobj); } EXPORT_SYMBOL_GPL(sysfs_unbreak_active_protection); /** * sysfs_remove_file_ns - remove an object attribute with a custom ns tag * @kobj: object we're acting for * @attr: attribute descriptor * @ns: namespace tag of the file to remove * * Hash the attribute name and namespace tag and kill the victim. */ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { struct kernfs_node *parent = kobj->sd; kernfs_remove_by_name_ns(parent, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); /** * sysfs_remove_file_self - remove an object attribute from its own method * @kobj: object we're acting for * @attr: attribute descriptor * * See kernfs_remove_self() for details. */ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; bool ret; kn = kernfs_find_and_get(parent, attr->name); if (WARN_ON_ONCE(!kn)) return false; ret = kernfs_remove_self(kn); kernfs_put(kn); return ret; } EXPORT_SYMBOL_GPL(sysfs_remove_file_self); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr) { int i; for (i = 0; ptr[i]; i++) sysfs_remove_file(kobj, ptr[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_files); /** * sysfs_remove_file_from_group - remove an attribute file from a group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; if (group) { parent = kernfs_find_and_get(kobj->sd, group); } else { parent = kobj->sd; kernfs_get(parent); } if (parent) { kernfs_remove_by_name(parent, attr->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); /** * sysfs_create_bin_file - create binary file for object. * @kobj: object. * @attr: attribute descriptor. */ int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { kuid_t uid; kgid_t gid; if (WARN_ON(!kobj || !kobj->sd || !attr)) return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, attr->size, uid, gid, NULL); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); /** * sysfs_remove_bin_file - remove binary file for object. * @kobj: object. * @attr: attribute descriptor. */ void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid, kgid_t kgid) { struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; return kernfs_setattr(kn, &newattrs); } /** * sysfs_link_change_owner - change owner of a sysfs file. * @kobj: object of the kernfs_node the symlink is located in. * @targ: object of the kernfs_node the symlink points to. * @name: name of the link. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This function looks up the sysfs symlink entry @name under @kobj and changes * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of * @targ. * * Returns 0 on success or error code on failure. */ int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { struct kernfs_node *kn = NULL; int error; if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs) return -EINVAL; error = -ENOENT; kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns); if (!kn) goto out; error = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; error = internal_change_owner(kn, kuid, kgid); out: kernfs_put(kn); return error; } /** * sysfs_file_change_owner - change owner of a sysfs file. * @kobj: object. * @name: name of the file to change. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This function looks up the sysfs entry @name under @kobj and changes the * ownership to @kuid/@kgid. * * Returns 0 on success or error code on failure. */ int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { struct kernfs_node *kn; int error; if (!name) return -EINVAL; if (!kobj->state_in_sysfs) return -EINVAL; kn = kernfs_find_and_get(kobj->sd, name); if (!kn) return -ENOENT; error = internal_change_owner(kn, kuid, kgid); kernfs_put(kn); return error; } EXPORT_SYMBOL_GPL(sysfs_file_change_owner); /** * sysfs_change_owner - change owner of the given object. * @kobj: object. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Change the owner of the default directory, files, groups, and attributes of * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs * entries for a kobject are added by driver core. In summary, * sysfs_change_owner() takes care of the default directory entry for @kobj, * the default attributes associated with the ktype of @kobj and the default * attributes associated with the ktype of @kobj. * Additional properties not added by driver core have to be changed by the * driver or subsystem which created them. This is similar to how * driver/subsystem specific entries are removed. * * Returns 0 on success or error code on failure. */ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { int error; const struct kobj_type *ktype; if (!kobj->state_in_sysfs) return -EINVAL; /* Change the owner of the kobject itself. */ error = internal_change_owner(kobj->sd, kuid, kgid); if (error) return error; ktype = get_ktype(kobj); if (ktype) { /* * Change owner of the default groups associated with the * ktype of @kobj. */ error = sysfs_groups_change_owner(kobj, ktype->default_groups, kuid, kgid); if (error) return error; } return 0; } EXPORT_SYMBOL_GPL(sysfs_change_owner); /** * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer. * @buf: start of PAGE_SIZE buffer. * @fmt: format * @...: optional arguments to @format * * * Returns number of characters written to @buf. */ int sysfs_emit(char *buf, const char *fmt, ...) { va_list args; int len; if (WARN(!buf || offset_in_page(buf), "invalid sysfs_emit: buf:%p\n", buf)) return 0; va_start(args, fmt); len = vscnprintf(buf, PAGE_SIZE, fmt, args); va_end(args); return len; } EXPORT_SYMBOL_GPL(sysfs_emit); /** * sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer. * @buf: start of PAGE_SIZE buffer. * @at: offset in @buf to start write in bytes * @at must be >= 0 && < PAGE_SIZE * @fmt: format * @...: optional arguments to @fmt * * * Returns number of characters written starting at &@buf[@at]. */ int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { va_list args; int len; if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE, "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at)) return 0; va_start(args, fmt); len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args); va_end(args); return len; } EXPORT_SYMBOL_GPL(sysfs_emit_at); /** * sysfs_bin_attr_simple_read - read callback to simply copy from memory. * @file: attribute file which is being read. * @kobj: object to which the attribute belongs. * @attr: attribute descriptor. * @buf: destination buffer. * @off: offset in bytes from which to read. * @count: maximum number of bytes to read. * * Simple ->read() callback for bin_attributes backed by a buffer in memory. * The @private and @size members in struct bin_attribute must be set to the * buffer's location and size before the bin_attribute is created in sysfs. * * Bounds check for @off and @count is done in sysfs_kf_bin_read(). * Negative value check for @off is done in vfs_setpos() and default_llseek(). * * Returns number of bytes written to @buf. */ ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { memcpy(buf, attr->private + off, count); return count; } EXPORT_SYMBOL_GPL(sysfs_bin_attr_simple_read);
6 5 9 9 9 92 92 9 92 92 91 18 13 9 9 505 506 85 86 86 86 18 86 86 86 86 33 18 33 33 33 89 63 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/wakeup.c - System wakeup events framework * * Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. */ #define pr_fmt(fmt) "PM: " fmt #include <linux/device.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/export.h> #include <linux/suspend.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pm_wakeirq.h> #include <trace/events/power.h> #include "power.h" #define list_for_each_entry_rcu_locked(pos, head, member) \ list_for_each_entry_rcu(pos, head, member, \ srcu_read_lock_held(&wakeup_srcu)) /* * If set, the suspend/hibernate code will abort transitions to a sleep state * if wakeup events are registered during or immediately before the transition. */ bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ static unsigned int wakeup_irq[2] __read_mostly; static DEFINE_RAW_SPINLOCK(wakeup_irq_lock); /* If greater than 0 and the system is suspending, terminate the suspend. */ static atomic_t pm_abort_suspend __read_mostly; /* * Combined counters of registered wakeup events and wakeup events in progress. * They need to be modified together atomically, so it's better to use one * atomic variable to hold them both. */ static atomic_t combined_event_count = ATOMIC_INIT(0); #define IN_PROGRESS_BITS (sizeof(int) * 4) #define MAX_IN_PROGRESS ((1 << IN_PROGRESS_BITS) - 1) static void split_counters(unsigned int *cnt, unsigned int *inpr) { unsigned int comb = atomic_read(&combined_event_count); *cnt = (comb >> IN_PROGRESS_BITS); *inpr = comb & MAX_IN_PROGRESS; } /* A preserved old value of the events counter. */ static unsigned int saved_count; static DEFINE_RAW_SPINLOCK(events_lock); static void pm_wakeup_timer_fn(struct timer_list *t); static LIST_HEAD(wakeup_sources); static DECLARE_WAIT_QUEUE_HEAD(wakeup_count_wait_queue); DEFINE_STATIC_SRCU(wakeup_srcu); static struct wakeup_source deleted_ws = { .name = "deleted", .lock = __SPIN_LOCK_UNLOCKED(deleted_ws.lock), }; static DEFINE_IDA(wakeup_ida); /** * wakeup_source_create - Create a struct wakeup_source object. * @name: Name of the new wakeup source. */ struct wakeup_source *wakeup_source_create(const char *name) { struct wakeup_source *ws; const char *ws_name; int id; ws = kzalloc(sizeof(*ws), GFP_KERNEL); if (!ws) goto err_ws; ws_name = kstrdup_const(name, GFP_KERNEL); if (!ws_name) goto err_name; ws->name = ws_name; id = ida_alloc(&wakeup_ida, GFP_KERNEL); if (id < 0) goto err_id; ws->id = id; return ws; err_id: kfree_const(ws->name); err_name: kfree(ws); err_ws: return NULL; } EXPORT_SYMBOL_GPL(wakeup_source_create); /* * Record wakeup_source statistics being deleted into a dummy wakeup_source. */ static void wakeup_source_record(struct wakeup_source *ws) { unsigned long flags; spin_lock_irqsave(&deleted_ws.lock, flags); if (ws->event_count) { deleted_ws.total_time = ktime_add(deleted_ws.total_time, ws->total_time); deleted_ws.prevent_sleep_time = ktime_add(deleted_ws.prevent_sleep_time, ws->prevent_sleep_time); deleted_ws.max_time = ktime_compare(deleted_ws.max_time, ws->max_time) > 0 ? deleted_ws.max_time : ws->max_time; deleted_ws.event_count += ws->event_count; deleted_ws.active_count += ws->active_count; deleted_ws.relax_count += ws->relax_count; deleted_ws.expire_count += ws->expire_count; deleted_ws.wakeup_count += ws->wakeup_count; } spin_unlock_irqrestore(&deleted_ws.lock, flags); } static void wakeup_source_free(struct wakeup_source *ws) { ida_free(&wakeup_ida, ws->id); kfree_const(ws->name); kfree(ws); } /** * wakeup_source_destroy - Destroy a struct wakeup_source object. * @ws: Wakeup source to destroy. * * Use only for wakeup source objects created with wakeup_source_create(). */ void wakeup_source_destroy(struct wakeup_source *ws) { if (!ws) return; __pm_relax(ws); wakeup_source_record(ws); wakeup_source_free(ws); } EXPORT_SYMBOL_GPL(wakeup_source_destroy); /** * wakeup_source_add - Add given object to the list of wakeup sources. * @ws: Wakeup source object to add to the list. */ void wakeup_source_add(struct wakeup_source *ws) { unsigned long flags; if (WARN_ON(!ws)) return; spin_lock_init(&ws->lock); timer_setup(&ws->timer, pm_wakeup_timer_fn, 0); ws->active = false; raw_spin_lock_irqsave(&events_lock, flags); list_add_rcu(&ws->entry, &wakeup_sources); raw_spin_unlock_irqrestore(&events_lock, flags); } EXPORT_SYMBOL_GPL(wakeup_source_add); /** * wakeup_source_remove - Remove given object from the wakeup sources list. * @ws: Wakeup source object to remove from the list. */ void wakeup_source_remove(struct wakeup_source *ws) { unsigned long flags; if (WARN_ON(!ws)) return; raw_spin_lock_irqsave(&events_lock, flags); list_del_rcu(&ws->entry); raw_spin_unlock_irqrestore(&events_lock, flags); synchronize_srcu(&wakeup_srcu); del_timer_sync(&ws->timer); /* * Clear timer.function to make wakeup_source_not_registered() treat * this wakeup source as not registered. */ ws->timer.function = NULL; } EXPORT_SYMBOL_GPL(wakeup_source_remove); /** * wakeup_source_register - Create wakeup source and add it to the list. * @dev: Device this wakeup source is associated with (or NULL if virtual). * @name: Name of the wakeup source to register. */ struct wakeup_source *wakeup_source_register(struct device *dev, const char *name) { struct wakeup_source *ws; int ret; ws = wakeup_source_create(name); if (ws) { if (!dev || device_is_registered(dev)) { ret = wakeup_source_sysfs_add(dev, ws); if (ret) { wakeup_source_free(ws); return NULL; } } wakeup_source_add(ws); } return ws; } EXPORT_SYMBOL_GPL(wakeup_source_register); /** * wakeup_source_unregister - Remove wakeup source from the list and remove it. * @ws: Wakeup source object to unregister. */ void wakeup_source_unregister(struct wakeup_source *ws) { if (ws) { wakeup_source_remove(ws); if (ws->dev) wakeup_source_sysfs_remove(ws); wakeup_source_destroy(ws); } } EXPORT_SYMBOL_GPL(wakeup_source_unregister); /** * wakeup_sources_read_lock - Lock wakeup source list for read. * * Returns an index of srcu lock for struct wakeup_srcu. * This index must be passed to the matching wakeup_sources_read_unlock(). */ int wakeup_sources_read_lock(void) { return srcu_read_lock(&wakeup_srcu); } EXPORT_SYMBOL_GPL(wakeup_sources_read_lock); /** * wakeup_sources_read_unlock - Unlock wakeup source list. * @idx: return value from corresponding wakeup_sources_read_lock() */ void wakeup_sources_read_unlock(int idx) { srcu_read_unlock(&wakeup_srcu, idx); } EXPORT_SYMBOL_GPL(wakeup_sources_read_unlock); /** * wakeup_sources_walk_start - Begin a walk on wakeup source list * * Returns first object of the list of wakeup sources. * * Note that to be safe, wakeup sources list needs to be locked by calling * wakeup_source_read_lock() for this. */ struct wakeup_source *wakeup_sources_walk_start(void) { struct list_head *ws_head = &wakeup_sources; return list_entry_rcu(ws_head->next, struct wakeup_source, entry); } EXPORT_SYMBOL_GPL(wakeup_sources_walk_start); /** * wakeup_sources_walk_next - Get next wakeup source from the list * @ws: Previous wakeup source object * * Note that to be safe, wakeup sources list needs to be locked by calling * wakeup_source_read_lock() for this. */ struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws) { struct list_head *ws_head = &wakeup_sources; return list_next_or_null_rcu(ws_head, &ws->entry, struct wakeup_source, entry); } EXPORT_SYMBOL_GPL(wakeup_sources_walk_next); /** * device_wakeup_attach - Attach a wakeup source object to a device object. * @dev: Device to handle. * @ws: Wakeup source object to attach to @dev. * * This causes @dev to be treated as a wakeup device. */ static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws) { spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { spin_unlock_irq(&dev->power.lock); return -EEXIST; } dev->power.wakeup = ws; if (dev->power.wakeirq) device_wakeup_attach_irq(dev, dev->power.wakeirq); spin_unlock_irq(&dev->power.lock); return 0; } /** * device_wakeup_enable - Enable given device to be a wakeup source. * @dev: Device to handle. * * Create a wakeup source object, register it and attach it to @dev. */ int device_wakeup_enable(struct device *dev) { struct wakeup_source *ws; int ret; if (!dev || !dev->power.can_wakeup) return -EINVAL; if (pm_suspend_target_state != PM_SUSPEND_ON) dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__); ws = wakeup_source_register(dev, dev_name(dev)); if (!ws) return -ENOMEM; ret = device_wakeup_attach(dev, ws); if (ret) wakeup_source_unregister(ws); return ret; } EXPORT_SYMBOL_GPL(device_wakeup_enable); /** * device_wakeup_attach_irq - Attach a wakeirq to a wakeup source * @dev: Device to handle * @wakeirq: Device specific wakeirq entry * * Attach a device wakeirq to the wakeup source so the device * wake IRQ can be configured automatically for suspend and * resume. * * Call under the device's power.lock lock. */ void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) { struct wakeup_source *ws; ws = dev->power.wakeup; if (!ws) return; if (ws->wakeirq) dev_err(dev, "Leftover wakeup IRQ found, overriding\n"); ws->wakeirq = wakeirq; } /** * device_wakeup_detach_irq - Detach a wakeirq from a wakeup source * @dev: Device to handle * * Removes a device wakeirq from the wakeup source. * * Call under the device's power.lock lock. */ void device_wakeup_detach_irq(struct device *dev) { struct wakeup_source *ws; ws = dev->power.wakeup; if (ws) ws->wakeirq = NULL; } /** * device_wakeup_arm_wake_irqs - * * Iterates over the list of device wakeirqs to arm them. */ void device_wakeup_arm_wake_irqs(void) { struct wakeup_source *ws; int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) dev_pm_arm_wake_irq(ws->wakeirq); srcu_read_unlock(&wakeup_srcu, srcuidx); } /** * device_wakeup_disarm_wake_irqs - * * Iterates over the list of device wakeirqs to disarm them. */ void device_wakeup_disarm_wake_irqs(void) { struct wakeup_source *ws; int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) dev_pm_disarm_wake_irq(ws->wakeirq); srcu_read_unlock(&wakeup_srcu, srcuidx); } /** * device_wakeup_detach - Detach a device's wakeup source object from it. * @dev: Device to detach the wakeup source object from. * * After it returns, @dev will not be treated as a wakeup device any more. */ static struct wakeup_source *device_wakeup_detach(struct device *dev) { struct wakeup_source *ws; spin_lock_irq(&dev->power.lock); ws = dev->power.wakeup; dev->power.wakeup = NULL; spin_unlock_irq(&dev->power.lock); return ws; } /** * device_wakeup_disable - Do not regard a device as a wakeup source any more. * @dev: Device to handle. * * Detach the @dev's wakeup source object from it, unregister this wakeup source * object and destroy it. */ void device_wakeup_disable(struct device *dev) { struct wakeup_source *ws; if (!dev || !dev->power.can_wakeup) return; ws = device_wakeup_detach(dev); wakeup_source_unregister(ws); } EXPORT_SYMBOL_GPL(device_wakeup_disable); /** * device_set_wakeup_capable - Set/reset device wakeup capability flag. * @dev: Device to handle. * @capable: Whether or not @dev is capable of waking up the system from sleep. * * If @capable is set, set the @dev's power.can_wakeup flag and add its * wakeup-related attributes to sysfs. Otherwise, unset the @dev's * power.can_wakeup flag and remove its wakeup-related attributes from sysfs. * * This function may sleep and it can't be called from any context where * sleeping is not allowed. */ void device_set_wakeup_capable(struct device *dev, bool capable) { if (!!dev->power.can_wakeup == !!capable) return; dev->power.can_wakeup = capable; if (device_is_registered(dev) && !list_empty(&dev->power.entry)) { if (capable) { int ret = wakeup_sysfs_add(dev); if (ret) dev_info(dev, "Wakeup sysfs attributes not added\n"); } else { wakeup_sysfs_remove(dev); } } } EXPORT_SYMBOL_GPL(device_set_wakeup_capable); /** * device_set_wakeup_enable - Enable or disable a device to wake up the system. * @dev: Device to handle. * @enable: enable/disable flag */ int device_set_wakeup_enable(struct device *dev, bool enable) { if (enable) return device_wakeup_enable(dev); device_wakeup_disable(dev); return 0; } EXPORT_SYMBOL_GPL(device_set_wakeup_enable); /** * wakeup_source_not_registered - validate the given wakeup source. * @ws: Wakeup source to be validated. */ static bool wakeup_source_not_registered(struct wakeup_source *ws) { /* * Use timer struct to check if the given source is initialized * by wakeup_source_add. */ return ws->timer.function != pm_wakeup_timer_fn; } /* * The functions below use the observation that each wakeup event starts a * period in which the system should not be suspended. The moment this period * will end depends on how the wakeup event is going to be processed after being * detected and all of the possible cases can be divided into two distinct * groups. * * First, a wakeup event may be detected by the same functional unit that will * carry out the entire processing of it and possibly will pass it to user space * for further processing. In that case the functional unit that has detected * the event may later "close" the "no suspend" period associated with it * directly as soon as it has been dealt with. The pair of pm_stay_awake() and * pm_relax(), balanced with each other, is supposed to be used in such * situations. * * Second, a wakeup event may be detected by one functional unit and processed * by another one. In that case the unit that has detected it cannot really * "close" the "no suspend" period associated with it, unless it knows in * advance what's going to happen to the event during processing. This * knowledge, however, may not be available to it, so it can simply specify time * to wait before the system can be suspended and pass it as the second * argument of pm_wakeup_event(). * * It is valid to call pm_relax() after pm_wakeup_event(), in which case the * "no suspend" period will be ended either by the pm_relax(), or by the timer * function executed when the timer expires, whichever comes first. */ /** * wakeup_source_activate - Mark given wakeup source as active. * @ws: Wakeup source to handle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM * core of the event by incrementing the counter of the wakeup events being * processed. */ static void wakeup_source_activate(struct wakeup_source *ws) { unsigned int cec; if (WARN_ONCE(wakeup_source_not_registered(ws), "unregistered wakeup source\n")) return; ws->active = true; ws->active_count++; ws->last_time = ktime_get(); if (ws->autosleep_enabled) ws->start_prevent_time = ws->last_time; /* Increment the counter of events in progress. */ cec = atomic_inc_return(&combined_event_count); trace_wakeup_source_activate(ws->name, cec); } /** * wakeup_source_report_event - Report wakeup event using the given source. * @ws: Wakeup source to report the event for. * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. */ static void wakeup_source_report_event(struct wakeup_source *ws, bool hard) { ws->event_count++; /* This is racy, but the counter is approximate anyway. */ if (events_check_enabled) ws->wakeup_count++; if (!ws->active) wakeup_source_activate(ws); if (hard) pm_system_wakeup(); } /** * __pm_stay_awake - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the source of the event. * * It is safe to call this function from interrupt context. */ void __pm_stay_awake(struct wakeup_source *ws) { unsigned long flags; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); wakeup_source_report_event(ws, false); del_timer(&ws->timer); ws->timer_expires = 0; spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(__pm_stay_awake); /** * pm_stay_awake - Notify the PM core that a wakeup event is being processed. * @dev: Device the wakeup event is related to. * * Notify the PM core of a wakeup event (signaled by @dev) by calling * __pm_stay_awake for the @dev's wakeup source object. * * Call this function after detecting of a wakeup event if pm_relax() is going * to be called directly after processing the event (and possibly passing it to * user space for further processing). */ void pm_stay_awake(struct device *dev) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); __pm_stay_awake(dev->power.wakeup); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_stay_awake); #ifdef CONFIG_PM_AUTOSLEEP static void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now) { ktime_t delta = ktime_sub(now, ws->start_prevent_time); ws->prevent_sleep_time = ktime_add(ws->prevent_sleep_time, delta); } #else static inline void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now) {} #endif /** * wakeup_source_deactivate - Mark given wakeup source as inactive. * @ws: Wakeup source to handle. * * Update the @ws' statistics and notify the PM core that the wakeup source has * become inactive by decrementing the counter of wakeup events being processed * and incrementing the counter of registered wakeup events. */ static void wakeup_source_deactivate(struct wakeup_source *ws) { unsigned int cnt, inpr, cec; ktime_t duration; ktime_t now; ws->relax_count++; /* * __pm_relax() may be called directly or from a timer function. * If it is called directly right after the timer function has been * started, but before the timer function calls __pm_relax(), it is * possible that __pm_stay_awake() will be called in the meantime and * will set ws->active. Then, ws->active may be cleared immediately * by the __pm_relax() called from the timer function, but in such a * case ws->relax_count will be different from ws->active_count. */ if (ws->relax_count != ws->active_count) { ws->relax_count--; return; } ws->active = false; now = ktime_get(); duration = ktime_sub(now, ws->last_time); ws->total_time = ktime_add(ws->total_time, duration); if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time)) ws->max_time = duration; ws->last_time = now; del_timer(&ws->timer); ws->timer_expires = 0; if (ws->autosleep_enabled) update_prevent_sleep_time(ws, now); /* * Increment the counter of registered wakeup events and decrement the * counter of wakeup events in progress simultaneously. */ cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count); trace_wakeup_source_deactivate(ws->name, cec); split_counters(&cnt, &inpr); if (!inpr && waitqueue_active(&wakeup_count_wait_queue)) wake_up(&wakeup_count_wait_queue); } /** * __pm_relax - Notify the PM core that processing of a wakeup event has ended. * @ws: Wakeup source object associated with the source of the event. * * Call this function for wakeup events whose processing started with calling * __pm_stay_awake(). * * It is safe to call it from interrupt context. */ void __pm_relax(struct wakeup_source *ws) { unsigned long flags; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); if (ws->active) wakeup_source_deactivate(ws); spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(__pm_relax); /** * pm_relax - Notify the PM core that processing of a wakeup event has ended. * @dev: Device that signaled the event. * * Execute __pm_relax() for the @dev's wakeup source object. */ void pm_relax(struct device *dev) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); __pm_relax(dev->power.wakeup); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_relax); /** * pm_wakeup_timer_fn - Delayed finalization of a wakeup event. * @t: timer list * * Call wakeup_source_deactivate() for the wakeup source whose address is stored * in @data if it is currently active and its timer has not been canceled and * the expiration time of the timer is not in future. */ static void pm_wakeup_timer_fn(struct timer_list *t) { struct wakeup_source *ws = from_timer(ws, t, timer); unsigned long flags; spin_lock_irqsave(&ws->lock, flags); if (ws->active && ws->timer_expires && time_after_eq(jiffies, ws->timer_expires)) { wakeup_source_deactivate(ws); ws->expire_count++; } spin_unlock_irqrestore(&ws->lock, flags); } /** * pm_wakeup_ws_event - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the event source. * @msec: Anticipated event processing time (in milliseconds). * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Notify the PM core of a wakeup event whose source is @ws that will take * approximately @msec milliseconds to be processed by the kernel. If @ws is * not active, activate it. If @msec is nonzero, set up the @ws' timer to * execute pm_wakeup_timer_fn() in future. * * It is safe to call this function from interrupt context. */ void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard) { unsigned long flags; unsigned long expires; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); wakeup_source_report_event(ws, hard); if (!msec) { wakeup_source_deactivate(ws); goto unlock; } expires = jiffies + msecs_to_jiffies(msec); if (!expires) expires = 1; if (!ws->timer_expires || time_after(expires, ws->timer_expires)) { mod_timer(&ws->timer, expires); ws->timer_expires = expires; } unlock: spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(pm_wakeup_ws_event); /** * pm_wakeup_dev_event - Notify the PM core of a wakeup event. * @dev: Device the wakeup event is related to. * @msec: Anticipated event processing time (in milliseconds). * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Call pm_wakeup_ws_event() for the @dev's wakeup source object. */ void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); pm_wakeup_ws_event(dev->power.wakeup, msec, hard); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_wakeup_dev_event); void pm_print_active_wakeup_sources(void) { struct wakeup_source *ws; int srcuidx, active = 0; struct wakeup_source *last_activity_ws = NULL; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { if (ws->active) { pm_pr_dbg("active wakeup source: %s\n", ws->name); active = 1; } else if (!active && (!last_activity_ws || ktime_to_ns(ws->last_time) > ktime_to_ns(last_activity_ws->last_time))) { last_activity_ws = ws; } } if (!active && last_activity_ws) pm_pr_dbg("last active wakeup source: %s\n", last_activity_ws->name); srcu_read_unlock(&wakeup_srcu, srcuidx); } EXPORT_SYMBOL_GPL(pm_print_active_wakeup_sources); /** * pm_wakeup_pending - Check if power transition in progress should be aborted. * * Compare the current number of registered wakeup events with its preserved * value from the past and return true if new wakeup events have been registered * since the old value was stored. Also return true if the current number of * wakeup events being processed is different from zero. */ bool pm_wakeup_pending(void) { unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&events_lock, flags); if (events_check_enabled) { unsigned int cnt, inpr; split_counters(&cnt, &inpr); ret = (cnt != saved_count || inpr > 0); events_check_enabled = !ret; } raw_spin_unlock_irqrestore(&events_lock, flags); if (ret) { pm_pr_dbg("Wakeup pending, aborting suspend\n"); pm_print_active_wakeup_sources(); } return ret || atomic_read(&pm_abort_suspend) > 0; } EXPORT_SYMBOL_GPL(pm_wakeup_pending); void pm_system_wakeup(void) { atomic_inc(&pm_abort_suspend); s2idle_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); void pm_system_cancel_wakeup(void) { atomic_dec_if_positive(&pm_abort_suspend); } void pm_wakeup_clear(unsigned int irq_number) { raw_spin_lock_irq(&wakeup_irq_lock); if (irq_number && wakeup_irq[0] == irq_number) wakeup_irq[0] = wakeup_irq[1]; else wakeup_irq[0] = 0; wakeup_irq[1] = 0; raw_spin_unlock_irq(&wakeup_irq_lock); if (!irq_number) atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) { unsigned long flags; raw_spin_lock_irqsave(&wakeup_irq_lock, flags); if (wakeup_irq[0] == 0) wakeup_irq[0] = irq_number; else if (wakeup_irq[1] == 0) wakeup_irq[1] = irq_number; else irq_number = 0; pm_pr_dbg("Triggering wakeup from IRQ %d\n", irq_number); raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags); if (irq_number) pm_system_wakeup(); } unsigned int pm_wakeup_irq(void) { return wakeup_irq[0]; } /** * pm_get_wakeup_count - Read the number of registered wakeup events. * @count: Address to store the value at. * @block: Whether or not to block. * * Store the number of registered wakeup events at the address in @count. If * @block is set, block until the current number of wakeup events being * processed is zero. * * Return 'false' if the current number of wakeup events being processed is * nonzero. Otherwise return 'true'. */ bool pm_get_wakeup_count(unsigned int *count, bool block) { unsigned int cnt, inpr; if (block) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(&wakeup_count_wait_queue, &wait, TASK_INTERRUPTIBLE); split_counters(&cnt, &inpr); if (inpr == 0 || signal_pending(current)) break; pm_print_active_wakeup_sources(); schedule(); } finish_wait(&wakeup_count_wait_queue, &wait); } split_counters(&cnt, &inpr); *count = cnt; return !inpr; } /** * pm_save_wakeup_count - Save the current number of registered wakeup events. * @count: Value to compare with the current number of registered wakeup events. * * If @count is equal to the current number of registered wakeup events and the * current number of wakeup events being processed is zero, store @count as the * old number of registered wakeup events for pm_check_wakeup_events(), enable * wakeup events detection and return 'true'. Otherwise disable wakeup events * detection and return 'false'. */ bool pm_save_wakeup_count(unsigned int count) { unsigned int cnt, inpr; unsigned long flags; events_check_enabled = false; raw_spin_lock_irqsave(&events_lock, flags); split_counters(&cnt, &inpr); if (cnt == count && inpr == 0) { saved_count = count; events_check_enabled = true; } raw_spin_unlock_irqrestore(&events_lock, flags); return events_check_enabled; } #ifdef CONFIG_PM_AUTOSLEEP /** * pm_wakep_autosleep_enabled - Modify autosleep_enabled for all wakeup sources. * @set: Whether to set or to clear the autosleep_enabled flags. */ void pm_wakep_autosleep_enabled(bool set) { struct wakeup_source *ws; ktime_t now = ktime_get(); int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { spin_lock_irq(&ws->lock); if (ws->autosleep_enabled != set) { ws->autosleep_enabled = set; if (ws->active) { if (set) ws->start_prevent_time = now; else update_prevent_sleep_time(ws, now); } } spin_unlock_irq(&ws->lock); } srcu_read_unlock(&wakeup_srcu, srcuidx); } #endif /* CONFIG_PM_AUTOSLEEP */ /** * print_wakeup_source_stats - Print wakeup source statistics information. * @m: seq_file to print the statistics into. * @ws: Wakeup source object to print the statistics for. */ static int print_wakeup_source_stats(struct seq_file *m, struct wakeup_source *ws) { unsigned long flags; ktime_t total_time; ktime_t max_time; unsigned long active_count; ktime_t active_time; ktime_t prevent_sleep_time; spin_lock_irqsave(&ws->lock, flags); total_time = ws->total_time; max_time = ws->max_time; prevent_sleep_time = ws->prevent_sleep_time; active_count = ws->active_count; if (ws->active) { ktime_t now = ktime_get(); active_time = ktime_sub(now, ws->last_time); total_time = ktime_add(total_time, active_time); if (active_time > max_time) max_time = active_time; if (ws->autosleep_enabled) prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(now, ws->start_prevent_time)); } else { active_time = 0; } seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n", ws->name, active_count, ws->event_count, ws->wakeup_count, ws->expire_count, ktime_to_ms(active_time), ktime_to_ms(total_time), ktime_to_ms(max_time), ktime_to_ms(ws->last_time), ktime_to_ms(prevent_sleep_time)); spin_unlock_irqrestore(&ws->lock, flags); return 0; } static void *wakeup_sources_stats_seq_start(struct seq_file *m, loff_t *pos) { struct wakeup_source *ws; loff_t n = *pos; int *srcuidx = m->private; if (n == 0) { seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t" "expire_count\tactive_since\ttotal_time\tmax_time\t" "last_change\tprevent_suspend_time\n"); } *srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) { if (n-- <= 0) return ws; } return NULL; } static void *wakeup_sources_stats_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct wakeup_source *ws = v; struct wakeup_source *next_ws = NULL; ++(*pos); list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) { next_ws = ws; break; } if (!next_ws) print_wakeup_source_stats(m, &deleted_ws); return next_ws; } static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v) { int *srcuidx = m->private; srcu_read_unlock(&wakeup_srcu, *srcuidx); } /** * wakeup_sources_stats_seq_show - Print wakeup sources statistics information. * @m: seq_file to print the statistics into. * @v: wakeup_source of each iteration */ static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v) { struct wakeup_source *ws = v; print_wakeup_source_stats(m, ws); return 0; } static const struct seq_operations wakeup_sources_stats_seq_ops = { .start = wakeup_sources_stats_seq_start, .next = wakeup_sources_stats_seq_next, .stop = wakeup_sources_stats_seq_stop, .show = wakeup_sources_stats_seq_show, }; static int wakeup_sources_stats_open(struct inode *inode, struct file *file) { return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int)); } static const struct file_operations wakeup_sources_stats_fops = { .owner = THIS_MODULE, .open = wakeup_sources_stats_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; static int __init wakeup_sources_debugfs_init(void) { debugfs_create_file("wakeup_sources", 0444, NULL, NULL, &wakeup_sources_stats_fops); return 0; } postcore_initcall(wakeup_sources_debugfs_init);
12 12 12 12 12 12 12 12 78 60 60 48 60 9 6 9 9 5 58 12 57 3 24 1 25 11 65 65 64 4 1 14 26 27 5 5 5 51 36 41 25 2 27 39 13 17 8 3 14 7 7 2 7 7 7 3 3 45 44 203 225 60 15 33 16 16 50 50 21 22 30 50 50 48 42 6 4 2 46 46 4 50 5 5 4 2 4 2 1 2 2 4 61 3 6 4 51 6 1 5 1 5 6 50 50 44 41 40 41 6 42 5 5 5 5 4 1 4 3 1 3 4 1 3 1 3 4 4 4 45 2 47 1 43 1 2 1 44 44 45 52 52 3 47 49 148 6 142 7 142 148 133 15 11 8 3 2 42 35 1 1 1 33 27 4 2 2 3 15 1 1 1 1 22 7 3 21 23 3 393 30 30 8 155 11 134 144 145 145 145 133 12 47 37 10 42 1 35 7 22 18 20 22 37 4 39 2 39 2 37 4 32 9 29 12 39 3 41 41 38 3 12 28 25 8 6 1 2 1 2 5 1 4 220 266 6 6 7 2 2 6 4 2 4 4 1 253 34 385 89 89 88 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. * * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. */ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. */ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. */ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { /* for default tnl0 device allow to change only the proto */ t->parms.proto = p->proto; netdev_state_change(t->dev); } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; struct net *net; int err; net = params->link_net ? : dev_net(dev); ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) ip6_tnl_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup);
7 14 7 2 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NetLabel Network Address Lists * * This file contains network address list functions used to manage ordered * lists of network addresses for use by the NetLabel subsystem. The NetLabel * system manages static and dynamic label mappings for network protocols such * as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 */ #ifndef _NETLABEL_ADDRLIST_H #define _NETLABEL_ADDRLIST_H #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/in6.h> #include <linux/audit.h> /** * struct netlbl_af4list - NetLabel IPv4 address list * @addr: IPv4 address * @mask: IPv4 address mask * @valid: valid flag * @list: list structure, used internally */ struct netlbl_af4list { __be32 addr; __be32 mask; u32 valid; struct list_head list; }; /** * struct netlbl_af6list - NetLabel IPv6 address list * @addr: IPv6 address * @mask: IPv6 address mask * @valid: valid flag * @list: list structure, used internally */ struct netlbl_af6list { struct in6_addr addr; struct in6_addr mask; u32 valid; struct list_head list; }; #define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list) static inline struct netlbl_af4list *__af4list_valid(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { i = i->next; n = __af4list_entry(i); } return n; } static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { i = rcu_dereference(list_next_rcu(i)); n = __af4list_entry(i); } return n; } #define netlbl_af4list_foreach(iter, head) \ for (iter = __af4list_valid((head)->next, head); \ &iter->list != (head); \ iter = __af4list_valid(iter->list.next, head)) #define netlbl_af4list_foreach_rcu(iter, head) \ for (iter = __af4list_valid_rcu((head)->next, head); \ &iter->list != (head); \ iter = __af4list_valid_rcu(iter->list.next, head)) #define netlbl_af4list_foreach_safe(iter, tmp, head) \ for (iter = __af4list_valid((head)->next, head), \ tmp = __af4list_valid(iter->list.next, head); \ &iter->list != (head); \ iter = tmp, tmp = __af4list_valid(iter->list.next, head)) int netlbl_af4list_add(struct netlbl_af4list *entry, struct list_head *head); struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask, struct list_head *head); void netlbl_af4list_remove_entry(struct netlbl_af4list *entry); struct netlbl_af4list *netlbl_af4list_search(__be32 addr, struct list_head *head); struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr, __be32 mask, struct list_head *head); #ifdef CONFIG_AUDIT void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, __be32 addr, __be32 mask); #else static inline void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, __be32 addr, __be32 mask) { } #endif #if IS_ENABLED(CONFIG_IPV6) #define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list) static inline struct netlbl_af6list *__af6list_valid(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { i = i->next; n = __af6list_entry(i); } return n; } static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { i = rcu_dereference(list_next_rcu(i)); n = __af6list_entry(i); } return n; } #define netlbl_af6list_foreach(iter, head) \ for (iter = __af6list_valid((head)->next, head); \ &iter->list != (head); \ iter = __af6list_valid(iter->list.next, head)) #define netlbl_af6list_foreach_rcu(iter, head) \ for (iter = __af6list_valid_rcu((head)->next, head); \ &iter->list != (head); \ iter = __af6list_valid_rcu(iter->list.next, head)) #define netlbl_af6list_foreach_safe(iter, tmp, head) \ for (iter = __af6list_valid((head)->next, head), \ tmp = __af6list_valid(iter->list.next, head); \ &iter->list != (head); \ iter = tmp, tmp = __af6list_valid(iter->list.next, head)) int netlbl_af6list_add(struct netlbl_af6list *entry, struct list_head *head); struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr, const struct in6_addr *mask, struct list_head *head); void netlbl_af6list_remove_entry(struct netlbl_af6list *entry); struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr, struct list_head *head); struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr, const struct in6_addr *mask, struct list_head *head); #ifdef CONFIG_AUDIT void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, const struct in6_addr *addr, const struct in6_addr *mask); #else static inline void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, const struct in6_addr *addr, const struct in6_addr *mask) { } #endif #endif /* IPV6 */ #endif
6 2 2 1 3 3 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge Multiple Spanning Tree Support * * Authors: * Tobias Waldekranz <tobias@waldekranz.com> */ #include <linux/kernel.h> #include <net/switchdev.h> #include "br_private.h" DEFINE_STATIC_KEY_FALSE(br_mst_used); bool br_mst_enabled(const struct net_device *dev) { if (!netif_is_bridge_master(dev)) return false; return br_opt_get(netdev_priv(dev), BROPT_MST_ENABLED); } EXPORT_SYMBOL_GPL(br_mst_enabled); int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) { const struct net_bridge_vlan_group *vg; const struct net_bridge_vlan *v; const struct net_bridge *br; ASSERT_RTNL(); if (!netif_is_bridge_master(dev)) return -EINVAL; br = netdev_priv(dev); if (!br_opt_get(br, BROPT_MST_ENABLED)) return -EINVAL; vg = br_vlan_group(br); list_for_each_entry(v, &vg->vlan_list, vlist) { if (v->msti == msti) __set_bit(v->vid, vids); } return 0; } EXPORT_SYMBOL_GPL(br_mst_get_info); int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state) { const struct net_bridge_port *p = NULL; const struct net_bridge_vlan_group *vg; const struct net_bridge_vlan *v; ASSERT_RTNL(); p = br_port_get_check_rtnl(dev); if (!p || !br_opt_get(p->br, BROPT_MST_ENABLED)) return -EINVAL; vg = nbp_vlan_group(p); list_for_each_entry(v, &vg->vlan_list, vlist) { if (v->brvlan->msti == msti) { *state = v->state; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(br_mst_get_state); static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *v, u8 state) { if (br_vlan_get_state(v) == state) return; br_vlan_set_state(v, state); if (v->vid == vg->pvid) br_vlan_set_pvid_state(vg, state); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .id = SWITCHDEV_ATTR_ID_PORT_MST_STATE, .orig_dev = p->dev, .u.mst_state = { .msti = msti, .state = state, }, }; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; int err = 0; rcu_read_lock(); vg = nbp_vlan_group_rcu(p); if (!vg) goto out; /* MSTI 0 (CST) state changes are notified via the regular * SWITCHDEV_ATTR_ID_PORT_STP_STATE. */ if (msti) { err = switchdev_port_attr_set(p->dev, &attr, extack); if (err && err != -EOPNOTSUPP) goto out; } err = 0; list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (v->brvlan->msti != msti) continue; br_mst_vlan_set_state(vg, v, state); } out: rcu_read_unlock(); return err; } static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti) { struct net_bridge_vlan_group *vg = nbp_vlan_group(pv->port); struct net_bridge_vlan *v; list_for_each_entry(v, &vg->vlan_list, vlist) { /* If this port already has a defined state in this * MSTI (through some other VLAN membership), inherit * it. */ if (v != pv && v->brvlan->msti == msti) { br_mst_vlan_set_state(vg, pv, v->state); return; } } /* Otherwise, start out in a new MSTI with all ports disabled. */ return br_mst_vlan_set_state(vg, pv, BR_STATE_DISABLED); } int br_mst_vlan_set_msti(struct net_bridge_vlan *mv, u16 msti) { struct switchdev_attr attr = { .id = SWITCHDEV_ATTR_ID_VLAN_MSTI, .orig_dev = mv->br->dev, .u.vlan_msti = { .vid = mv->vid, .msti = msti, }, }; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *pv; struct net_bridge_port *p; int err; if (mv->msti == msti) return 0; err = switchdev_port_attr_set(mv->br->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) return err; mv->msti = msti; list_for_each_entry(p, &mv->br->port_list, list) { vg = nbp_vlan_group(p); pv = br_vlan_find(vg, mv->vid); if (pv) br_mst_vlan_sync_state(pv, msti); } return 0; } void br_mst_vlan_init_state(struct net_bridge_vlan *v) { /* VLANs always start out in MSTI 0 (CST) */ v->msti = 0; if (br_vlan_is_master(v)) v->state = BR_STATE_FORWARDING; else v->state = v->port->state; } int br_mst_set_enabled(struct net_bridge *br, bool on, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .id = SWITCHDEV_ATTR_ID_BRIDGE_MST, .orig_dev = br->dev, .u.mst = on, }; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; int err; list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); if (!vg->num_vlans) continue; NL_SET_ERR_MSG(extack, "MST mode can't be changed while VLANs exist"); return -EBUSY; } if (br_opt_get(br, BROPT_MST_ENABLED) == on) return 0; err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; if (on) static_branch_enable(&br_mst_used); else static_branch_disable(&br_mst_used); br_opt_toggle(br, BROPT_MST_ENABLED, on); return 0; } size_t br_mst_info_size(const struct net_bridge_vlan_group *vg) { DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 }; const struct net_bridge_vlan *v; size_t sz; /* IFLA_BRIDGE_MST */ sz = nla_total_size(0); list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (test_bit(v->brvlan->msti, seen)) continue; /* IFLA_BRIDGE_MST_ENTRY */ sz += nla_total_size(0) + /* IFLA_BRIDGE_MST_ENTRY_MSTI */ nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_MST_ENTRY_STATE */ nla_total_size(sizeof(u8)); __set_bit(v->brvlan->msti, seen); } return sz; } int br_mst_fill_info(struct sk_buff *skb, const struct net_bridge_vlan_group *vg) { DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 }; const struct net_bridge_vlan *v; struct nlattr *nest; int err = 0; list_for_each_entry(v, &vg->vlan_list, vlist) { if (test_bit(v->brvlan->msti, seen)) continue; nest = nla_nest_start_noflag(skb, IFLA_BRIDGE_MST_ENTRY); if (!nest || nla_put_u16(skb, IFLA_BRIDGE_MST_ENTRY_MSTI, v->brvlan->msti) || nla_put_u8(skb, IFLA_BRIDGE_MST_ENTRY_STATE, v->state)) { err = -EMSGSIZE; break; } nla_nest_end(skb, nest); __set_bit(v->brvlan->msti, seen); } return err; } static const struct nla_policy br_mst_nl_policy[IFLA_BRIDGE_MST_ENTRY_MAX + 1] = { [IFLA_BRIDGE_MST_ENTRY_MSTI] = NLA_POLICY_RANGE(NLA_U16, 1, /* 0 reserved for CST */ VLAN_N_VID - 1), [IFLA_BRIDGE_MST_ENTRY_STATE] = NLA_POLICY_RANGE(NLA_U8, BR_STATE_DISABLED, BR_STATE_BLOCKING), }; static int br_mst_process_one(struct net_bridge_port *p, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_MST_ENTRY_MAX + 1]; u16 msti; u8 state; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_MST_ENTRY_MAX, attr, br_mst_nl_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_MST_ENTRY_MSTI]) { NL_SET_ERR_MSG_MOD(extack, "MSTI not specified"); return -EINVAL; } if (!tb[IFLA_BRIDGE_MST_ENTRY_STATE]) { NL_SET_ERR_MSG_MOD(extack, "State not specified"); return -EINVAL; } msti = nla_get_u16(tb[IFLA_BRIDGE_MST_ENTRY_MSTI]); state = nla_get_u8(tb[IFLA_BRIDGE_MST_ENTRY_STATE]); return br_mst_set_state(p, msti, state, extack); } int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack) { struct nlattr *attr; int err, msts = 0; int rem; if (!br_opt_get(p->br, BROPT_MST_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Can't modify MST state when MST is disabled"); return -EBUSY; } nla_for_each_nested(attr, mst_attr, rem) { switch (nla_type(attr)) { case IFLA_BRIDGE_MST_ENTRY: err = br_mst_process_one(p, attr, extack); break; default: continue; } msts++; if (err) break; } if (!msts) { NL_SET_ERR_MSG_MOD(extack, "Found no MST entries to process"); err = -EINVAL; } return err; }
2 2 2 2 2 11 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 // SPDX-License-Identifier: GPL-2.0 /* * Tty buffer allocation management */ #include <linux/types.h> #include <linux/errno.h> #include <linux/minmax.h> #include <linux/tty.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ratelimit.h> #include "tty.h" #define MIN_TTYB_SIZE 256 #define TTYB_ALIGN_MASK 0xff /* * Byte threshold to limit memory consumption for flip buffers. * The actual memory limit is > 2x this amount. */ #define TTYB_DEFAULT_MEM_LIMIT (640 * 1024UL) /* * We default to dicing tty buffer allocations to this many characters * in order to avoid multiple page allocations. We know the size of * tty_buffer itself but it must also be taken into account that the * buffer is 256 byte aligned. See tty_buffer_find for the allocation * logic this must match. */ #define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~TTYB_ALIGN_MASK) /** * tty_buffer_lock_exclusive - gain exclusive access to buffer * @port: tty port owning the flip buffer * * Guarantees safe use of the &tty_ldisc_ops.receive_buf() method by excluding * the buffer work and any pending flush from using the flip buffer. Data can * continue to be added concurrently to the flip buffer from the driver side. * * See also tty_buffer_unlock_exclusive(). */ void tty_buffer_lock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; atomic_inc(&buf->priority); mutex_lock(&buf->lock); } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer * * The buffer work is restarted if there is data in the flip buffer. * * See also tty_buffer_lock_exclusive(). */ void tty_buffer_unlock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; bool restart = buf->head->commit != buf->head->read; atomic_dec(&buf->priority); mutex_unlock(&buf->lock); if (restart) queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); /** * tty_buffer_space_avail - return unused buffer space * @port: tty port owning the flip buffer * * Returns: the # of bytes which can be written by the driver without reaching * the buffer limit. * * Note: this does not guarantee that memory is available to write the returned * # of bytes (use tty_prepare_flip_string() to pre-allocate if memory * guarantee is required). */ unsigned int tty_buffer_space_avail(struct tty_port *port) { int space = port->buf.mem_limit - atomic_read(&port->buf.mem_used); return max(space, 0); } EXPORT_SYMBOL_GPL(tty_buffer_space_avail); static void tty_buffer_reset(struct tty_buffer *p, size_t size) { p->used = 0; p->size = size; p->next = NULL; p->commit = 0; p->lookahead = 0; p->read = 0; p->flags = true; } /** * tty_buffer_free_all - free buffers used by a tty * @port: tty port to free from * * Remove all the buffers pending on a tty whether queued with data or in the * free ring. Must be called when the tty is no longer in use. */ void tty_buffer_free_all(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *p, *next; struct llist_node *llist; unsigned int freed = 0; int still_used; while ((p = buf->head) != NULL) { buf->head = p->next; freed += p->size; if (p->size > 0) kfree(p); } llist = llist_del_all(&buf->free); llist_for_each_entry_safe(p, next, llist, free) kfree(p); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; still_used = atomic_xchg(&buf->mem_used, 0); WARN(still_used != freed, "we still have not freed %d bytes!", still_used - freed); } /** * tty_buffer_alloc - allocate a tty buffer * @port: tty port * @size: desired size (characters) * * Allocate a new tty buffer to hold the desired number of characters. We * round our buffers off in 256 character chunks to get better allocation * behaviour. * * Returns: %NULL if out of memory or the allocation would exceed the per * device queue. */ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) { struct llist_node *free; struct tty_buffer *p; /* Round the buffer size out */ size = __ALIGN_MASK(size, TTYB_ALIGN_MASK); if (size <= MIN_TTYB_SIZE) { free = llist_del_first(&port->buf.free); if (free) { p = llist_entry(free, struct tty_buffer, free); goto found; } } /* Should possibly check if this fails for the largest buffer we * have queued and recycle that ? */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; p = kmalloc(struct_size(p, data, 2 * size), GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; found: tty_buffer_reset(p, size); atomic_add(size, &port->buf.mem_used); return p; } /** * tty_buffer_free - free a tty buffer * @port: tty port owning the buffer * @b: the buffer to free * * Free a tty buffer, or add it to the free list according to our internal * strategy. */ static void tty_buffer_free(struct tty_port *port, struct tty_buffer *b) { struct tty_bufhead *buf = &port->buf; /* Dumb strategy for now - should keep some stats */ WARN_ON(atomic_sub_return(b->size, &buf->mem_used) < 0); if (b->size > MIN_TTYB_SIZE) kfree(b); else if (b->size > 0) llist_add(&b->free, &buf->free); } /** * tty_buffer_flush - flush full tty buffers * @tty: tty to flush * @ld: optional ldisc ptr (must be referenced) * * Flush all the buffers containing receive data. If @ld != %NULL, flush the * ldisc input buffer. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld) { struct tty_port *port = tty->port; struct tty_bufhead *buf = &port->buf; struct tty_buffer *next; atomic_inc(&buf->priority); mutex_lock(&buf->lock); /* paired w/ release in __tty_buffer_request_room; ensures there are * no pending memory accesses to the freed buffer */ while ((next = smp_load_acquire(&buf->head->next)) != NULL) { tty_buffer_free(port, buf->head); buf->head = next; } buf->head->read = buf->head->commit; buf->head->lookahead = buf->head->read; if (ld && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); atomic_dec(&buf->priority); mutex_unlock(&buf->lock); } /** * __tty_buffer_request_room - grow tty buffer if needed * @port: tty port * @size: size desired * @flags: buffer has to store flags along character data * * Make at least @size bytes of linear space available for the tty buffer. * * Will change over to a new buffer if the current buffer is encoded as * %TTY_NORMAL (so has no flags buffer) and the new buffer requires a flags * buffer. * * Returns: the size we managed to find. */ static int __tty_buffer_request_room(struct tty_port *port, size_t size, bool flags) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *n, *b = buf->tail; size_t left = (b->flags ? 1 : 2) * b->size - b->used; bool change = !b->flags && flags; if (!change && left >= size) return size; /* This is the slow path - looking for new buffers to use */ n = tty_buffer_alloc(port, size); if (n == NULL) return change ? 0 : left; n->flags = flags; buf->tail = n; /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures they see all buffer data. */ smp_store_release(&b->commit, b->used); /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures the latest commit value can be read before the head * is advanced to the next buffer. */ smp_store_release(&b->next, n); return size; } int tty_buffer_request_room(struct tty_port *port, size_t size) { return __tty_buffer_request_room(port, size, true); } EXPORT_SYMBOL_GPL(tty_buffer_request_room); size_t __tty_insert_flip_string_flags(struct tty_port *port, const u8 *chars, const u8 *flags, bool mutable_flags, size_t size) { bool need_flags = mutable_flags || flags[0] != TTY_NORMAL; size_t copied = 0; do { size_t goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE); size_t space = __tty_buffer_request_room(port, goal, need_flags); struct tty_buffer *tb = port->buf.tail; if (unlikely(space == 0)) break; memcpy(char_buf_ptr(tb, tb->used), chars, space); if (mutable_flags) { memcpy(flag_buf_ptr(tb, tb->used), flags, space); flags += space; } else if (tb->flags) { memset(flag_buf_ptr(tb, tb->used), flags[0], space); } else { /* tb->flags should be available once requested */ WARN_ON_ONCE(need_flags); } tb->used += space; copied += space; chars += space; /* There is a small chance that we need to split the data over * several buffers. If this is the case we must loop. */ } while (unlikely(size > copied)); return copied; } EXPORT_SYMBOL(__tty_insert_flip_string_flags); /** * tty_prepare_flip_string - make room for characters * @port: tty port * @chars: return pointer for character write area * @size: desired size * * Prepare a block of space in the buffer for data. * * This is used for drivers that need their own block copy routines into the * buffer. There is no guarantee the buffer is a DMA target! * * Returns: the length available and buffer pointer (@chars) to the space which * is now allocated and accounted for as ready for normal characters. */ size_t tty_prepare_flip_string(struct tty_port *port, u8 **chars, size_t size) { size_t space = __tty_buffer_request_room(port, size, false); if (likely(space)) { struct tty_buffer *tb = port->buf.tail; *chars = char_buf_ptr(tb, tb->used); if (tb->flags) memset(flag_buf_ptr(tb, tb->used), TTY_NORMAL, space); tb->used += space; } return space; } EXPORT_SYMBOL_GPL(tty_prepare_flip_string); /** * tty_ldisc_receive_buf - forward data to line discipline * @ld: line discipline to process input * @p: char buffer * @f: %TTY_NORMAL, %TTY_BREAK, etc. flags buffer * @count: number of bytes to process * * Callers other than flush_to_ldisc() need to exclude the kworker from * concurrent use of the line discipline, see paste_selection(). * * Returns: the number of bytes processed. */ size_t tty_ldisc_receive_buf(struct tty_ldisc *ld, const u8 *p, const u8 *f, size_t count) { if (ld->ops->receive_buf2) count = ld->ops->receive_buf2(ld->tty, p, f, count); else { count = min_t(size_t, count, ld->tty->receive_room); if (count && ld->ops->receive_buf) ld->ops->receive_buf(ld->tty, p, f, count); } return count; } EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf); static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head) { head->lookahead = max(head->lookahead, head->read); while (head) { struct tty_buffer *next; unsigned int count; /* * Paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer. */ next = smp_load_acquire(&head->next); /* * Paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data. */ count = smp_load_acquire(&head->commit) - head->lookahead; if (!count) { head = next; continue; } if (port->client_ops->lookahead_buf) { u8 *p, *f = NULL; p = char_buf_ptr(head, head->lookahead); if (head->flags) f = flag_buf_ptr(head, head->lookahead); port->client_ops->lookahead_buf(port, p, f, count); } head->lookahead += count; } } static size_t receive_buf(struct tty_port *port, struct tty_buffer *head, size_t count) { u8 *p = char_buf_ptr(head, head->read); const u8 *f = NULL; size_t n; if (head->flags) f = flag_buf_ptr(head, head->read); n = port->client_ops->receive_buf(port, p, f, count); if (n > 0) memset(p, 0, n); return n; } /** * flush_to_ldisc - flush data from buffer to ldisc * @work: tty structure passed from work queue. * * This routine is called out of the software interrupt to flush data from the * buffer chain to the line discipline. * * The receive_buf() method is single threaded for each tty instance. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ static void flush_to_ldisc(struct work_struct *work) { struct tty_port *port = container_of(work, struct tty_port, buf.work); struct tty_bufhead *buf = &port->buf; mutex_lock(&buf->lock); while (1) { struct tty_buffer *head = buf->head; struct tty_buffer *next; size_t count, rcvd; /* Ldisc or user is trying to gain exclusive access */ if (atomic_read(&buf->priority)) break; /* paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer */ next = smp_load_acquire(&head->next); /* paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data */ count = smp_load_acquire(&head->commit) - head->read; if (!count) { if (next == NULL) break; buf->head = next; tty_buffer_free(port, head); continue; } rcvd = receive_buf(port, head, count); head->read += rcvd; if (rcvd < count) lookahead_bufs(port, head); if (!rcvd) break; if (need_resched()) cond_resched(); } mutex_unlock(&buf->lock); } static inline void tty_flip_buffer_commit(struct tty_buffer *tail) { /* * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees * buffer data. */ smp_store_release(&tail->commit, tail->used); } /** * tty_flip_buffer_push - push terminal buffers * @port: tty port to push * * Queue a push of the terminal flip buffers to the line discipline. Can be * called from IRQ/atomic context. * * In the event of the queue being busy for flipping the work will be held off * and retried later. */ void tty_flip_buffer_push(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL(tty_flip_buffer_push); /** * tty_insert_flip_string_and_push_buffer - add characters to the tty buffer and * push * @port: tty port * @chars: characters * @size: size * * The function combines tty_insert_flip_string() and tty_flip_buffer_push() * with the exception of properly holding the @port->lock. * * To be used only internally (by pty currently). * * Returns: the number added. */ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, const u8 *chars, size_t size) { struct tty_bufhead *buf = &port->buf; unsigned long flags; spin_lock_irqsave(&port->lock, flags); size = tty_insert_flip_string(port, chars, size); if (size) tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); queue_work(system_unbound_wq, &buf->work); return size; } /** * tty_buffer_init - prepare a tty buffer structure * @port: tty port to initialise * * Set up the initial state of the buffer management for a tty device. Must be * called before the other tty buffer functions are used. */ void tty_buffer_init(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; mutex_init(&buf->lock); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; init_llist_head(&buf->free); atomic_set(&buf->mem_used, 0); atomic_set(&buf->priority, 0); INIT_WORK(&buf->work, flush_to_ldisc); buf->mem_limit = TTYB_DEFAULT_MEM_LIMIT; } /** * tty_buffer_set_limit - change the tty buffer memory limit * @port: tty port to change * @limit: memory limit to set * * Change the tty buffer memory limit. * * Must be called before the other tty buffer functions are used. */ int tty_buffer_set_limit(struct tty_port *port, int limit) { if (limit < MIN_TTYB_SIZE) return -EINVAL; port->buf.mem_limit = limit; return 0; } EXPORT_SYMBOL_GPL(tty_buffer_set_limit); /* slave ptys can claim nested buffer lock when handling BRK and INTR */ void tty_buffer_set_lock_subclass(struct tty_port *port) { lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE); } bool tty_buffer_restart_work(struct tty_port *port) { return queue_work(system_unbound_wq, &port->buf.work); } bool tty_buffer_cancel_work(struct tty_port *port) { return cancel_work_sync(&port->buf.work); } void tty_buffer_flush_work(struct tty_port *port) { flush_work(&port->buf.work); }
13 2 2 3 3 3 3 3 11 11 4 1 1 1 1 3 3 3 3 3 3 3 4 1 1 1 1 2 4 3 1 11 11 11 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" struct devlink_region { struct devlink *devlink; struct devlink_port *port; struct list_head list; union { const struct devlink_region_ops *ops; const struct devlink_port_region_ops *port_ops; }; struct mutex snapshot_lock; /* protects snapshot_list, * max_snapshots and cur_snapshots * consistency. */ struct list_head snapshot_list; u32 max_snapshots; u32 cur_snapshots; u64 size; }; struct devlink_snapshot { struct list_head list; struct devlink_region *region; u8 *data; u32 id; }; static struct devlink_region * devlink_region_get_by_name(struct devlink *devlink, const char *region_name) { struct devlink_region *region; list_for_each_entry(region, &devlink->region_list, list) if (!strcmp(region->ops->name, region_name)) return region; return NULL; } static struct devlink_region * devlink_port_region_get_by_name(struct devlink_port *port, const char *region_name) { struct devlink_region *region; list_for_each_entry(region, &port->region_list, list) if (!strcmp(region->ops->name, region_name)) return region; return NULL; } static struct devlink_snapshot * devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) { struct devlink_snapshot *snapshot; list_for_each_entry(snapshot, &region->snapshot_list, list) if (snapshot->id == id) return snapshot; return NULL; } static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_snapshot *snapshot) { struct nlattr *snap_attr; int err; snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT); if (!snap_attr) return -EMSGSIZE; err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); if (err) goto nla_put_failure; nla_nest_end(msg, snap_attr); return 0; nla_put_failure: nla_nest_cancel(msg, snap_attr); return err; } static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_region *region) { struct devlink_snapshot *snapshot; struct nlattr *snapshots_attr; int err; snapshots_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOTS); if (!snapshots_attr) return -EMSGSIZE; list_for_each_entry(snapshot, &region->snapshot_list, list) { err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); if (err) goto nla_put_failure; } nla_nest_end(msg, snapshots_attr); return 0; nla_put_failure: nla_nest_cancel(msg, snapshots_attr); return err; } static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct devlink_region *region) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; err = devlink_nl_put_handle(msg, devlink); if (err) goto nla_put_failure; if (region->port) { err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto nla_put_failure; } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto nla_put_failure; err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE, region->size); if (err) goto nla_put_failure; err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, region->max_snapshots); if (err) goto nla_put_failure; err = devlink_nl_region_snapshots_id_put(msg, devlink, region); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return err; } static struct sk_buff * devlink_nl_region_notify_build(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd, u32 portid, u32 seq) { struct devlink *devlink = region->devlink; struct sk_buff *msg; void *hdr; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); if (!hdr) { err = -EMSGSIZE; goto out_free_msg; } err = devlink_nl_put_handle(msg, devlink); if (err) goto out_cancel_msg; if (region->port) { err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto out_cancel_msg; } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto out_cancel_msg; if (snapshot) { err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); if (err) goto out_cancel_msg; } else { err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE, region->size); if (err) goto out_cancel_msg; } genlmsg_end(msg, hdr); return msg; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); return ERR_PTR(err); } static void devlink_nl_region_notify(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd) { struct devlink *devlink = region->devlink; struct sk_buff *msg; WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); if (IS_ERR(msg)) return; devlink_nl_notify_send(devlink, msg); } void devlink_regions_notify_register(struct devlink *devlink) { struct devlink_region *region; list_for_each_entry(region, &devlink->region_list, list) devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); } void devlink_regions_notify_unregister(struct devlink *devlink) { struct devlink_region *region; list_for_each_entry_reverse(region, &devlink->region_list, list) devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); } /** * __devlink_snapshot_id_increment - Increment number of snapshots using an id * @devlink: devlink instance * @id: the snapshot id * * Track when a new snapshot begins using an id. Load the count for the * given id from the snapshot xarray, increment it, and store it back. * * Called when a new snapshot is created with the given id. * * The id *must* have been previously allocated by * devlink_region_snapshot_id_get(). * * Returns 0 on success, or an error on failure. */ static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) { unsigned long count; void *p; int err; xa_lock(&devlink->snapshot_ids); p = xa_load(&devlink->snapshot_ids, id); if (WARN_ON(!p)) { err = -EINVAL; goto unlock; } if (WARN_ON(!xa_is_value(p))) { err = -EINVAL; goto unlock; } count = xa_to_value(p); count++; err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), GFP_ATOMIC)); unlock: xa_unlock(&devlink->snapshot_ids); return err; } /** * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id * @devlink: devlink instance * @id: the snapshot id * * Track when a snapshot is deleted and stops using an id. Load the count * for the given id from the snapshot xarray, decrement it, and store it * back. * * If the count reaches zero, erase this id from the xarray, freeing it * up for future re-use by devlink_region_snapshot_id_get(). * * Called when a snapshot using the given id is deleted, and when the * initial allocator of the id is finished using it. */ static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) { unsigned long count; void *p; xa_lock(&devlink->snapshot_ids); p = xa_load(&devlink->snapshot_ids, id); if (WARN_ON(!p)) goto unlock; if (WARN_ON(!xa_is_value(p))) goto unlock; count = xa_to_value(p); if (count > 1) { count--; __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), GFP_ATOMIC); } else { /* If this was the last user, we can erase this id */ __xa_erase(&devlink->snapshot_ids, id); } unlock: xa_unlock(&devlink->snapshot_ids); } /** * __devlink_snapshot_id_insert - Insert a specific snapshot ID * @devlink: devlink instance * @id: the snapshot id * * Mark the given snapshot id as used by inserting a zero value into the * snapshot xarray. * * This must be called while holding the devlink instance lock. Unlike * devlink_snapshot_id_get, the initial reference count is zero, not one. * It is expected that the id will immediately be used before * releasing the devlink instance lock. * * Returns zero on success, or an error code if the snapshot id could not * be inserted. */ static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) { int err; xa_lock(&devlink->snapshot_ids); if (xa_load(&devlink->snapshot_ids, id)) { xa_unlock(&devlink->snapshot_ids); return -EEXIST; } err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), GFP_ATOMIC)); xa_unlock(&devlink->snapshot_ids); return err; } /** * __devlink_region_snapshot_id_get - get snapshot ID * @devlink: devlink instance * @id: storage to return snapshot id * * Allocates a new snapshot id. Returns zero on success, or a negative * error on failure. Must be called while holding the devlink instance * lock. * * Snapshot IDs are tracked using an xarray which stores the number of * users of the snapshot id. * * Note that the caller of this function counts as a 'user', in order to * avoid race conditions. The caller must release its hold on the * snapshot by using devlink_region_snapshot_id_put. */ static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1), xa_limit_32b, GFP_KERNEL); } /** * __devlink_region_snapshot_create - create a new snapshot * This will add a new snapshot of a region. The snapshot * will be stored on the region struct and can be accessed * from devlink. This is useful for future analyses of snapshots. * Multiple snapshots can be created on a region. * The @snapshot_id should be obtained using the getter function. * * Must be called only while holding the region snapshot lock. * * @region: devlink region of the snapshot * @data: snapshot data * @snapshot_id: snapshot id to be created */ static int __devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id) { struct devlink *devlink = region->devlink; struct devlink_snapshot *snapshot; int err; lockdep_assert_held(&region->snapshot_lock); /* check if region can hold one more snapshot */ if (region->cur_snapshots == region->max_snapshots) return -ENOSPC; if (devlink_region_snapshot_get_by_id(region, snapshot_id)) return -EEXIST; snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); if (!snapshot) return -ENOMEM; err = __devlink_snapshot_id_increment(devlink, snapshot_id); if (err) goto err_snapshot_id_increment; snapshot->id = snapshot_id; snapshot->region = region; snapshot->data = data; list_add_tail(&snapshot->list, &region->snapshot_list); region->cur_snapshots++; devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); return 0; err_snapshot_id_increment: kfree(snapshot); return err; } static void devlink_region_snapshot_del(struct devlink_region *region, struct devlink_snapshot *snapshot) { struct devlink *devlink = region->devlink; lockdep_assert_held(&region->snapshot_lock); devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); region->cur_snapshots--; list_del(&snapshot->list); region->ops->destructor(snapshot->data); __devlink_snapshot_id_decrement(devlink, snapshot->id); kfree(snapshot); } int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_port *port = NULL; struct devlink_region *region; const char *region_name; struct sk_buff *msg; unsigned int index; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) return -EINVAL; if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, info->snd_portid, info->snd_seq, 0, region); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, struct netlink_callback *cb, struct devlink_port *port, int *idx, int start, int flags) { struct devlink_region *region; int err = 0; list_for_each_entry(region, &port->region_list, list) { if (*idx < start) { (*idx)++; continue; } err = devlink_nl_region_fill(msg, port->devlink, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, region); if (err) goto out; (*idx)++; } out: return err; } static int devlink_nl_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; struct devlink_port *port; unsigned long port_index; int idx = 0; int err; list_for_each_entry(region, &devlink->region_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, region); if (err) { state->idx = idx; return err; } idx++; } xa_for_each(&devlink->ports, port_index, port) { err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, state->idx, flags); if (err) { state->idx = idx; return err; } } return 0; } int devlink_nl_region_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one); } int devlink_nl_region_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_snapshot *snapshot; struct devlink_port *port = NULL; struct devlink_region *region; const char *region_name; unsigned int index; u32 snapshot_id; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) || GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID)) return -EINVAL; region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) return -EINVAL; mutex_lock(&region->snapshot_lock); snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (!snapshot) { mutex_unlock(&region->snapshot_lock); return -EINVAL; } devlink_region_snapshot_del(region, snapshot); mutex_unlock(&region->snapshot_lock); return 0; } int devlink_nl_region_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_snapshot *snapshot; struct devlink_port *port = NULL; struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; unsigned int index; u32 snapshot_id; u8 *data; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) { NL_SET_ERR_MSG(info->extack, "No region name provided"); return -EINVAL; } region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) { NL_SET_ERR_MSG(info->extack, "The requested region does not exist"); return -EINVAL; } if (!region->ops->snapshot) { NL_SET_ERR_MSG(info->extack, "The requested region does not support taking an immediate snapshot"); return -EOPNOTSUPP; } mutex_lock(&region->snapshot_lock); if (region->cur_snapshots == region->max_snapshots) { NL_SET_ERR_MSG(info->extack, "The region has reached the maximum number of stored snapshots"); err = -ENOSPC; goto unlock; } snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; if (snapshot_id_attr) { snapshot_id = nla_get_u32(snapshot_id_attr); if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { NL_SET_ERR_MSG(info->extack, "The requested snapshot id is already in use"); err = -EEXIST; goto unlock; } err = __devlink_snapshot_id_insert(devlink, snapshot_id); if (err) goto unlock; } else { err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); if (err) { NL_SET_ERR_MSG(info->extack, "Failed to allocate a new snapshot id"); goto unlock; } } if (port) err = region->port_ops->snapshot(port, region->port_ops, info->extack, &data); else err = region->ops->snapshot(devlink, region->ops, info->extack, &data); if (err) goto err_snapshot_capture; err = __devlink_region_snapshot_create(region, data, snapshot_id); if (err) goto err_snapshot_create; if (!snapshot_id_attr) { struct sk_buff *msg; snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (WARN_ON(!snapshot)) { err = -EINVAL; goto unlock; } msg = devlink_nl_region_notify_build(region, snapshot, DEVLINK_CMD_REGION_NEW, info->snd_portid, info->snd_seq); err = PTR_ERR_OR_ZERO(msg); if (err) goto err_notify; err = genlmsg_reply(msg, info); if (err) goto err_notify; } mutex_unlock(&region->snapshot_lock); return 0; err_snapshot_create: region->ops->destructor(data); err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); mutex_unlock(&region->snapshot_lock); return err; err_notify: devlink_region_snapshot_del(region, snapshot); unlock: mutex_unlock(&region->snapshot_lock); return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, u8 *chunk, u32 chunk_size, u64 addr) { struct nlattr *chunk_attr; int err; chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK); if (!chunk_attr) return -EINVAL; err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); if (err) goto nla_put_failure; err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr); if (err) goto nla_put_failure; nla_nest_end(msg, chunk_attr); return 0; nla_put_failure: nla_nest_cancel(msg, chunk_attr); return err; } #define DEVLINK_REGION_READ_CHUNK_SIZE 256 typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack); static int devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb, void *cb_priv, u64 start_offset, u64 end_offset, u64 *new_offset, struct netlink_ext_ack *extack) { u64 curr_offset = start_offset; int err = 0; u8 *data; /* Allocate and re-use a single buffer */ data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL); if (!data) return -ENOMEM; *new_offset = start_offset; while (curr_offset < end_offset) { u32 data_size; data_size = min_t(u32, end_offset - curr_offset, DEVLINK_REGION_READ_CHUNK_SIZE); err = cb(cb_priv, data, data_size, curr_offset, extack); if (err) break; err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset); if (err) break; curr_offset += data_size; } *new_offset = curr_offset; kfree(data); return err; } static int devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack __always_unused *extack) { struct devlink_snapshot *snapshot = cb_priv; memcpy(chunk, &snapshot->data[curr_offset], chunk_size); return 0; } static int devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack) { struct devlink_region *region = cb_priv; return region->port_ops->read(region->port, region->port_ops, extack, curr_offset, chunk_size, chunk); } static int devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack) { struct devlink_region *region = cb_priv; return region->ops->read(region->devlink, region->ops, extack, curr_offset, chunk_size, chunk); } int devlink_nl_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->info.attrs; struct devlink_port *port = NULL; devlink_chunk_fill_t *region_cb; struct devlink_region *region; const char *region_name; struct devlink *devlink; unsigned int index; void *region_cb_priv; void *hdr; int err; start_offset = state->start_offset; devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, false); if (IS_ERR(devlink)) return PTR_ERR(devlink); if (!attrs[DEVLINK_ATTR_REGION_NAME]) { NL_SET_ERR_MSG(cb->extack, "No region name provided"); err = -EINVAL; goto out_unlock; } if (attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) { err = -ENODEV; goto out_unlock; } } region_attr = attrs[DEVLINK_ATTR_REGION_NAME]; region_name = nla_data(region_attr); if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) { NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist"); err = -EINVAL; goto out_unlock; } snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; if (!snapshot_attr) { if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); err = -EINVAL; goto out_unlock; } if (!region->ops->read) { NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); err = -EOPNOTSUPP; goto out_unlock; } if (port) region_cb = &devlink_region_port_direct_fill; else region_cb = &devlink_region_direct_fill; region_cb_priv = region; } else { struct devlink_snapshot *snapshot; u32 snapshot_id; if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); err = -EINVAL; goto out_unlock; } snapshot_id = nla_get_u32(snapshot_attr); snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (!snapshot) { NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); err = -EINVAL; goto out_unlock; } region_cb = &devlink_region_snapshot_fill; region_cb_priv = snapshot; } if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { if (!start_offset) start_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); } if (end_offset > region->size) end_offset = region->size; /* return 0 if there is no further data to read */ if (start_offset == end_offset) { err = 0; goto out_unlock; } hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, DEVLINK_CMD_REGION_READ); if (!hdr) { err = -EMSGSIZE; goto out_unlock; } err = devlink_nl_put_handle(skb, devlink); if (err) goto nla_put_failure; if (region->port) { err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto nla_put_failure; } err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); if (err) goto nla_put_failure; chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS); if (!chunks_attr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, start_offset, end_offset, &ret_offset, cb->extack); if (err && err != -EMSGSIZE) goto nla_put_failure; /* Check if there was any progress done to prevent infinite loop */ if (ret_offset == start_offset) { err = -EINVAL; goto nla_put_failure; } state->start_offset = ret_offset; nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); devl_unlock(devlink); devlink_put(devlink); return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); out_unlock: devl_unlock(devlink); devlink_put(devlink); return err; } /** * devl_region_create - create a new address region * * @devlink: devlink * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region */ struct devlink_region *devl_region_create(struct devlink *devlink, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink_region *region; devl_assert_locked(devlink); if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); if (devlink_region_get_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); region = kzalloc(sizeof(*region), GFP_KERNEL); if (!region) return ERR_PTR(-ENOMEM); region->devlink = devlink; region->max_snapshots = region_max_snapshots; region->ops = ops; region->size = region_size; INIT_LIST_HEAD(&region->snapshot_list); mutex_init(&region->snapshot_lock); list_add_tail(&region->list, &devlink->region_list); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); return region; } EXPORT_SYMBOL_GPL(devl_region_create); /** * devlink_region_create - create a new address region * * @devlink: devlink * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region * * Context: Takes and release devlink->lock <mutex>. */ struct devlink_region * devlink_region_create(struct devlink *devlink, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink_region *region; devl_lock(devlink); region = devl_region_create(devlink, ops, region_max_snapshots, region_size); devl_unlock(devlink); return region; } EXPORT_SYMBOL_GPL(devlink_region_create); /** * devlink_port_region_create - create a new address region for a port * * @port: devlink port * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region * * Context: Takes and release devlink->lock <mutex>. */ struct devlink_region * devlink_port_region_create(struct devlink_port *port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink *devlink = port->devlink; struct devlink_region *region; int err = 0; ASSERT_DEVLINK_PORT_INITIALIZED(port); if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); devl_lock(devlink); if (devlink_port_region_get_by_name(port, ops->name)) { err = -EEXIST; goto unlock; } region = kzalloc(sizeof(*region), GFP_KERNEL); if (!region) { err = -ENOMEM; goto unlock; } region->devlink = devlink; region->port = port; region->max_snapshots = region_max_snapshots; region->port_ops = ops; region->size = region_size; INIT_LIST_HEAD(&region->snapshot_list); mutex_init(&region->snapshot_lock); list_add_tail(&region->list, &port->region_list); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); devl_unlock(devlink); return region; unlock: devl_unlock(devlink); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(devlink_port_region_create); /** * devl_region_destroy - destroy address region * * @region: devlink region to destroy */ void devl_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; struct devlink_snapshot *snapshot, *ts; devl_assert_locked(devlink); /* Free all snapshots of region */ mutex_lock(&region->snapshot_lock); list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list) devlink_region_snapshot_del(region, snapshot); mutex_unlock(&region->snapshot_lock); list_del(&region->list); mutex_destroy(&region->snapshot_lock); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); kfree(region); } EXPORT_SYMBOL_GPL(devl_region_destroy); /** * devlink_region_destroy - destroy address region * * @region: devlink region to destroy * * Context: Takes and release devlink->lock <mutex>. */ void devlink_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; devl_lock(devlink); devl_region_destroy(region); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_region_destroy); /** * devlink_region_snapshot_id_get - get snapshot ID * * This callback should be called when adding a new snapshot, * Driver should use the same id for multiple snapshots taken * on multiple regions at the same time/by the same trigger. * * The caller of this function must use devlink_region_snapshot_id_put * when finished creating regions using this id. * * Returns zero on success, or a negative error code on failure. * * @devlink: devlink * @id: storage to return id */ int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { return __devlink_region_snapshot_id_get(devlink, id); } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); /** * devlink_region_snapshot_id_put - put snapshot ID reference * * This should be called by a driver after finishing creating snapshots * with an id. Doing so ensures that the ID can later be released in the * event that all snapshots using it have been destroyed. * * @devlink: devlink * @id: id to release reference on */ void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) { __devlink_snapshot_id_decrement(devlink, id); } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); /** * devlink_region_snapshot_create - create a new snapshot * This will add a new snapshot of a region. The snapshot * will be stored on the region struct and can be accessed * from devlink. This is useful for future analyses of snapshots. * Multiple snapshots can be created on a region. * The @snapshot_id should be obtained using the getter function. * * @region: devlink region of the snapshot * @data: snapshot data * @snapshot_id: snapshot id to be created */ int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id) { int err; mutex_lock(&region->snapshot_lock); err = __devlink_region_snapshot_create(region, data, snapshot_id); mutex_unlock(&region->snapshot_lock); return err; } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
1 1 4 2 2 2 4 2 2 2 2 2 1 1 2 2 2 3 2 2 3 1 1 1 2 1 1 1 4 1 1 2 5 5 2 2 1 3 2 1 1 2 4 1 1 2 5 2 1 1 1 3 2 1 6 6 2 2 2 2 2 2 1 1 1 3 1 1 1 1 4 2 2 2 2 1 1 1 1 4 2 1 1 2 2 6 2 1 3 4 2 2 2 2 2 2 2 2 2 1 1 2 1 1 369 71 301 303 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * * Vendor commands implementation based on net/wireless/nl80211.c * which is: * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <net/genetlink.h> #include <linux/nfc.h> #include <linux/slab.h> #include "nfc.h" #include "llcp.h" static const struct genl_multicast_group nfc_genl_mcgrps[] = { { .name = NFC_GENL_MCAST_EVENT_NAME, }, }; static struct genl_family nfc_genl_family; static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, .len = NFC_DEVICE_NAME_MAXSIZE }, [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 }, [NFC_ATTR_RF_MODE] = { .type = NLA_U8 }, [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 }, [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { [NFC_SDP_ATTR_URI] = { .type = NLA_STRING, .len = U8_MAX - 4 }, [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 }, }; static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &nfc_genl_family, flags, NFC_CMD_GET_TARGET); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || nla_put_u16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res) || nla_put_u8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res)) goto nla_put_failure; if (target->nfcid1_len > 0 && nla_put(msg, NFC_ATTR_TARGET_NFCID1, target->nfcid1_len, target->nfcid1)) goto nla_put_failure; if (target->sensb_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSB_RES, target->sensb_res_len, target->sensb_res)) goto nla_put_failure; if (target->sensf_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSF_RES, target->sensf_res_len, target->sensf_res)) goto nla_put_failure; if (target->is_iso15693) { if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID, target->iso15693_dsfid) || nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID, sizeof(target->iso15693_uid), target->iso15693_uid)) goto nla_put_failure; } if (target->ats_len > 0 && nla_put(msg, NFC_ATTR_TARGET_ATS, target->ats_len, target->ats)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; u32 idx; if (!info->info.attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); idx = nla_get_u32(info->info.attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return ERR_PTR(-ENODEV); return dev; } static int nfc_genl_dump_targets(struct sk_buff *skb, struct netlink_callback *cb) { int i = cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; int rc; if (!dev) { dev = __get_device_from_cb(cb); if (IS_ERR(dev)) return PTR_ERR(dev); cb->args[1] = (long) dev; } device_lock(&dev->dev); cb->seq = dev->targets_generation; while (i < dev->n_targets) { rc = nfc_genl_send_target(skb, &dev->targets[i], cb, NLM_F_MULTI); if (rc < 0) break; i++; } device_unlock(&dev->dev); cb->args[0] = i; return skb->len; } static int nfc_genl_dump_targets_done(struct netlink_callback *cb) { struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; if (dev) nfc_put_device(dev); return 0; } int nfc_genl_targets_found(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; dev->genl_data.poll_req_portid = 0; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGETS_FOUND); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGET_LOST); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_ACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(msg, NFC_ATTR_TM_PROTOCOLS, protocol)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_deactivated(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_DEACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_setup_device_added(struct nfc_dev *dev, struct sk_buff *msg) { if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) return -1; return 0; } int nfc_genl_device_added(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_ADDED); if (!hdr) goto free_msg; if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_device_removed(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) { struct sk_buff *msg; struct nlattr *sdp_attr, *uri_attr; struct nfc_llcp_sdp_tlv *sdres; struct hlist_node *n; void *hdr; int rc = -EMSGSIZE; int i; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_LLC_SDRES); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; sdp_attr = nla_nest_start_noflag(msg, NFC_ATTR_LLC_SDP); if (sdp_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } i = 1; hlist_for_each_entry_safe(sdres, n, sdres_list, node) { pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap); uri_attr = nla_nest_start_noflag(msg, i++); if (uri_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap)) goto nla_put_failure; if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri)) goto nla_put_failure; nla_nest_end(msg, uri_attr); hlist_del(&sdres->node); nfc_llcp_free_sdp_tlv(sdres); } nla_nest_end(msg, sdp_attr); genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); nfc_llcp_free_sdp_tlv_list(sdres_list); return rc; } int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_ADDED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_TRANSACTION); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type) || nla_put(msg, NFC_ATTR_SE_AID, evt_transaction->aid_len, evt_transaction->aid) || nla_put(msg, NFC_ATTR_SE_PARAMS, evt_transaction->params_len, evt_transaction->params)) goto nla_put_failure; /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { const struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_CONNECTIVITY); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_DEVICE); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_device(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_devices_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is up\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_UP); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (rf_mode == NFC_RF_INITIATOR && nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; if (nla_put_u8(msg, NFC_ATTR_COMM_MODE, comm_mode) || nla_put_u8(msg, NFC_ATTR_RF_MODE, rf_mode)) goto nla_put_failure; genlmsg_end(msg, hdr); dev->dep_link_up = true; genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_dep_link_down_event(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is down\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_DOWN); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct nfc_dev *dev; u32 idx; int rc = -ENOBUFS; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto out_putdev; } rc = nfc_genl_send_device(msg, dev, info->snd_portid, info->snd_seq, NULL, 0); if (rc < 0) goto out_free; nfc_put_device(dev); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_putdev: nfc_put_device(dev); return rc; } static int nfc_genl_dev_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_up(dev); nfc_put_device(dev); return rc; } static int nfc_genl_dev_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; u32 im_protocols = 0, tm_protocols = 0; pr_debug("Poll start\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || ((!info->attrs[NFC_ATTR_IM_PROTOCOLS] && !info->attrs[NFC_ATTR_PROTOCOLS]) && !info->attrs[NFC_ATTR_TM_PROTOCOLS])) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (info->attrs[NFC_ATTR_TM_PROTOCOLS]) tm_protocols = nla_get_u32(info->attrs[NFC_ATTR_TM_PROTOCOLS]); if (info->attrs[NFC_ATTR_IM_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_IM_PROTOCOLS]); else if (info->attrs[NFC_ATTR_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; mutex_lock(&dev->genl_data.genl_data_mutex); rc = nfc_start_poll(dev, im_protocols, tm_protocols); if (!rc) dev->genl_data.poll_req_portid = info->snd_portid; mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (!dev->polling) { device_unlock(&dev->dev); nfc_put_device(dev); return -EINVAL; } device_unlock(&dev->dev); mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid != info->snd_portid) { rc = -EBUSY; goto out; } rc = nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; out: mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx, protocol; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX] || !info->attrs[NFC_ATTR_PROTOCOLS]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); rc = nfc_activate_target(dev, target_idx, protocol); nfc_put_device(dev); return rc; } static int nfc_genl_deactivate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc, tgt_idx; u32 idx; u8 comm; pr_debug("DEP link up\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_COMM_MODE]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (!info->attrs[NFC_ATTR_TARGET_INDEX]) tgt_idx = NFC_TARGET_IDX_ANY; else tgt_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); comm = nla_get_u8(info->attrs[NFC_ATTR_COMM_MODE]); if (comm != NFC_COMM_ACTIVE && comm != NFC_COMM_PASSIVE) return -EINVAL; dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_up(dev, tgt_idx, comm); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_send_params(struct sk_buff *msg, struct nfc_llcp_local *local, u32 portid, u32 seq) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, 0, NFC_CMD_LLC_GET_PARAMS); if (!hdr) return -EMSGSIZE; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, local->dev->idx) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_LTO, local->lto) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_RW, local->rw) || nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; int rc = 0; struct sk_buff *msg = NULL; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto put_local; } rc = nfc_genl_send_params(msg, local, info->snd_portid, info->snd_seq); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); if (rc < 0) { if (msg) nlmsg_free(msg); return rc; } return genlmsg_reply(msg, info); } static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; u8 rw = 0; u16 miux = 0; u32 idx; int rc = 0; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || (!info->attrs[NFC_ATTR_LLC_PARAM_LTO] && !info->attrs[NFC_ATTR_LLC_PARAM_RW] && !info->attrs[NFC_ATTR_LLC_PARAM_MIUX])) return -EINVAL; if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) { rw = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_RW]); if (rw > LLCP_MAX_RW) return -EINVAL; } if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) { miux = nla_get_u16(info->attrs[NFC_ATTR_LLC_PARAM_MIUX]); if (miux > LLCP_MAX_MIUX) return -EINVAL; } idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) { if (dev->dep_link_up) { rc = -EINPROGRESS; goto put_local; } local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]); } if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) local->rw = rw; if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) local->miux = cpu_to_be16(miux); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1]; u32 idx; u8 tid; char *uri; int rc = 0, rem; size_t uri_len, tlvs_len; struct hlist_head sdreq_list; struct nfc_llcp_sdp_tlv *sdreq; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_LLC_SDP]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (dev->dep_link_up == false) { rc = -ENOLINK; goto exit; } local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } INIT_HLIST_HEAD(&sdreq_list); tlvs_len = 0; nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { rc = nla_parse_nested_deprecated(sdp_attrs, NFC_SDP_ATTR_MAX, attr, nfc_sdp_genl_policy, info->extack); if (rc != 0) { rc = -EINVAL; goto put_local; } if (!sdp_attrs[NFC_SDP_ATTR_URI]) continue; uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri_len == 0) continue; uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri == NULL || *uri == 0) continue; tid = local->sdreq_next_tid++; sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len); if (sdreq == NULL) { rc = -ENOMEM; goto put_local; } tlvs_len += sdreq->tlv_len; hlist_add_head(&sdreq->node, &sdreq_list); } if (hlist_empty(&sdreq_list)) { rc = -EINVAL; goto put_local; } rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); nfc_put_device(dev); return rc; } int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_FW_DOWNLOAD); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_enable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_enable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_disable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; struct nfc_se *se, *n; list_for_each_entry_safe(se, n, &dev->secure_elements, list) { hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_SE); if (!hdr) goto nla_put_failure; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); } return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_ses(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_ses_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } static int nfc_se_io(struct nfc_dev *dev, u32 se_idx, u8 *apdu, size_t apdu_length, se_io_cb_t cb, void *cb_context) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (!device_is_registered(&dev->dev)) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->se_io) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state != NFC_SE_ENABLED) { rc = -ENODEV; goto error; } rc = dev->ops->se_io(dev, se_idx, apdu, apdu_length, cb, cb_context); device_unlock(&dev->dev); return rc; error: device_unlock(&dev->dev); kfree(cb_context); return rc; } struct se_io_ctx { u32 dev_idx; u32 se_idx; }; static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err) { struct se_io_ctx *ctx = context; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { kfree(ctx); return; } hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_SE_IO); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, ctx->dev_idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, ctx->se_idx) || nla_put(msg, NFC_ATTR_SE_APDU, apdu_len, apdu)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); kfree(ctx); return; nla_put_failure: free_msg: nlmsg_free(msg); kfree(ctx); return; } static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct se_io_ctx *ctx; u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || !info->attrs[NFC_ATTR_SE_APDU]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->ops || !dev->ops->se_io) { rc = -EOPNOTSUPP; goto put_dev; } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); if (apdu_len == 0) { rc = -EINVAL; goto put_dev; } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); if (!apdu) { rc = -EINVAL; goto put_dev; } ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); if (!ctx) { rc = -ENOMEM; goto put_dev; } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); put_dev: nfc_put_device(dev); return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; int i, err; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_VENDOR_ID] || !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->vendor_cmds || !dev->n_vendor_cmds) { err = -ENODEV; goto put_dev; } if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); if (data_len == 0) { err = -EINVAL; goto put_dev; } } else { data = NULL; data_len = 0; } for (i = 0; i < dev->n_vendor_cmds; i++) { cmd = &dev->vendor_cmds[i]; if (cmd->vendor_id != vid || cmd->subcmd != subcmd) continue; dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; goto put_dev; } err = -EOPNOTSUPP; put_dev: nfc_put_device(dev); return err; } /* message building helper */ static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd); } static struct sk_buff * __nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen, u32 portid, u32 seq, enum nfc_attrs attr, u32 oui, u32 subcmd, gfp_t gfp) { struct sk_buff *skb; void *hdr; skb = nlmsg_new(approxlen + 100, gfp); if (!skb) return NULL; hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR); if (!hdr) { kfree_skb(skb); return NULL; } if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd)) goto nla_put_failure; ((void **)skb->cb)[0] = dev; ((void **)skb->cb)[1] = hdr; return skb; nla_put_failure: kfree_skb(skb); return NULL; } struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, enum nfc_attrs attr, u32 oui, u32 subcmd, int approxlen) { if (WARN_ON(!dev->cur_cmd_info)) return NULL; return __nfc_alloc_vendor_cmd_skb(dev, approxlen, dev->cur_cmd_info->snd_portid, dev->cur_cmd_info->snd_seq, attr, oui, subcmd, GFP_KERNEL); } EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb); int nfc_vendor_cmd_reply(struct sk_buff *skb) { struct nfc_dev *dev = ((void **)skb->cb)[0]; void *hdr = ((void **)skb->cb)[1]; /* clear CB data for netlink core to own from now on */ memset(skb->cb, 0, sizeof(skb->cb)); if (WARN_ON(!dev->cur_cmd_info)) { kfree_skb(skb); return -EINVAL; } genlmsg_end(skb, hdr); return genlmsg_reply(skb, dev->cur_cmd_info); } EXPORT_SYMBOL(nfc_vendor_cmd_reply); static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_get_device, .dumpit = nfc_genl_dump_devices, .done = nfc_genl_dump_devices_done, }, { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, { .cmd = NFC_CMD_LLC_GET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_get_params, }, { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nfc_genl_dump_ses, .done = nfc_genl_dump_ses_done, }, { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family nfc_genl_family __ro_after_init = { .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, .maxattr = NFC_ATTR_MAX, .policy = nfc_genl_policy, .module = THIS_MODULE, .ops = nfc_genl_ops, .n_ops = ARRAY_SIZE(nfc_genl_ops), .resv_start_op = NFC_CMD_DEACTIVATE_TARGET + 1, .mcgrps = nfc_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps), }; struct urelease_work { struct work_struct w; u32 portid; }; static void nfc_urelease_event_work(struct work_struct *work) { struct urelease_work *w = container_of(work, struct urelease_work, w); struct class_dev_iter iter; struct nfc_dev *dev; pr_debug("portid %d\n", w->portid); mutex_lock(&nfc_devlist_mutex); nfc_device_iter_init(&iter); dev = nfc_device_iter_next(&iter); while (dev) { mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid == w->portid) { nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; } mutex_unlock(&dev->genl_data.genl_data_mutex); dev = nfc_device_iter_next(&iter); } nfc_device_iter_exit(&iter); mutex_unlock(&nfc_devlist_mutex); kfree(w); } static int nfc_genl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct urelease_work *w; if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC) goto out; pr_debug("NETLINK_URELEASE event from id %d\n", n->portid); w = kmalloc(sizeof(*w), GFP_ATOMIC); if (w) { INIT_WORK(&w->w, nfc_urelease_event_work); w->portid = n->portid; schedule_work(&w->w); } out: return NOTIFY_DONE; } void nfc_genl_data_init(struct nfc_genl_data *genl_data) { genl_data->poll_req_portid = 0; mutex_init(&genl_data->genl_data_mutex); } void nfc_genl_data_exit(struct nfc_genl_data *genl_data) { mutex_destroy(&genl_data->genl_data_mutex); } static struct notifier_block nl_notifier = { .notifier_call = nfc_genl_rcv_nl_event, }; /** * nfc_genl_init() - Initialize netlink interface * * This initialization function registers the nfc netlink family. */ int __init nfc_genl_init(void) { int rc; rc = genl_register_family(&nfc_genl_family); if (rc) return rc; netlink_register_notifier(&nl_notifier); return 0; } /** * nfc_genl_exit() - Deinitialize netlink interface * * This exit function unregisters the nfc netlink family. */ void nfc_genl_exit(void) { netlink_unregister_notifier(&nl_notifier); genl_unregister_family(&nfc_genl_family); }
97 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_SEQADJ_H #define _NF_CONNTRACK_SEQADJ_H #include <net/netfilter/nf_conntrack_extend.h> /** * struct nf_ct_seqadj - sequence number adjustment information * * @correction_pos: position of the last TCP sequence number modification * @offset_before: sequence number offset before last modification * @offset_after: sequence number offset after last modification */ struct nf_ct_seqadj { u32 correction_pos; s32 offset_before; s32 offset_after; }; struct nf_conn_seqadj { struct nf_ct_seqadj seq[IP_CT_DIR_MAX]; }; static inline struct nf_conn_seqadj *nfct_seqadj(const struct nf_conn *ct) { return nf_ct_ext_find(ct, NF_CT_EXT_SEQADJ); } static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct) { return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC); } int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off); void nf_ct_tcp_seqadj_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, u32 seq); #endif /* _NF_CONNTRACK_SEQADJ_H */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 // SPDX-License-Identifier: GPL-2.0-only /* xfrm4_tunnel.c: Generic IP tunnel transformer. * * Copyright (C) 2003 David S. Miller (davem@redhat.com) */ #define pr_fmt(fmt) "IPsec: " fmt #include <linux/skbuff.h> #include <linux/module.h> #include <net/xfrm.h> #include <net/protocol.h> static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); return 0; } static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb) { return ip_hdr(skb)->protocol; } static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->props.mode != XFRM_MODE_TUNNEL) { NL_SET_ERR_MSG(extack, "IPv4 tunnel can only be used with tunnel mode"); return -EINVAL; } if (x->encap) { NL_SET_ERR_MSG(extack, "IPv4 tunnel is not compatible with encapsulation"); return -EINVAL; } x->props.header_len = sizeof(struct iphdr); return 0; } static void ipip_destroy(struct xfrm_state *x) { } static const struct xfrm_type ipip_type = { .owner = THIS_MODULE, .proto = IPPROTO_IPIP, .init_state = ipip_init_state, .destructor = ipip_destroy, .input = ipip_xfrm_rcv, .output = ipip_output }; static int xfrm_tunnel_rcv(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) { return -ENOENT; } static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 4, }; #if IS_ENABLED(CONFIG_IPV6) static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 3, }; #endif static int __init ipip_init(void) { if (xfrm_register_type(&ipip_type, AF_INET) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) { pr_info("%s: can't add xfrm handler for AF_INET\n", __func__); xfrm_unregister_type(&ipip_type, AF_INET); return -EAGAIN; } #if IS_ENABLED(CONFIG_IPV6) if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__); xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET); xfrm_unregister_type(&ipip_type, AF_INET); return -EAGAIN; } #endif return 0; } static void __exit ipip_fini(void) { #if IS_ENABLED(CONFIG_IPV6) if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) pr_info("%s: can't remove xfrm handler for AF_INET6\n", __func__); #endif if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) pr_info("%s: can't remove xfrm handler for AF_INET\n", __func__); xfrm_unregister_type(&ipip_type, AF_INET); } module_init(ipip_init); module_exit(ipip_fini); MODULE_DESCRIPTION("IPv4 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);
12 1 2 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "IPsec: " fmt #include <crypto/hash.h> #include <crypto/utils.h> #include <linux/err.h> #include <linux/module.h> #include <linux/slab.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <linux/scatterlist.h> #include <net/icmp.h> #include <net/protocol.h> struct ah_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; }; #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { unsigned int len; len = size + crypto_ahash_digestsize(ahash); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); len = ALIGN(len, __alignof__(struct scatterlist)); len += sizeof(struct scatterlist) * nfrags; return kmalloc(len, GFP_ATOMIC); } static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset) { return tmp + offset; } static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset) { return tmp + offset; } static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, u8 *icv) { struct ahash_request *req; req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), crypto_tfm_ctx_alignment()); ahash_request_set_tfm(req, ahash); return req; } static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, struct ahash_request *req) { return (void *)ALIGN((unsigned long)(req + 1) + crypto_ahash_reqsize(ahash), __alignof__(struct scatterlist)); } /* Clear mutable options and find final destination to substitute * into IP header for icv calculation. Options are already checked * for validity, so paranoia is not required. */ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) { unsigned char *optptr = (unsigned char *)(iph+1); int l = iph->ihl*4 - sizeof(struct iphdr); int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return 0; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen<2 || optlen>l) return -EINVAL; switch (*optptr) { case IPOPT_SEC: case 0x85: /* Some "Extended Security" crap. */ case IPOPT_CIPSO: case IPOPT_RA: case 0x80|21: /* RFC1770 */ break; case IPOPT_LSRR: case IPOPT_SSRR: if (optlen < 6) return -EINVAL; memcpy(daddr, optptr+optlen-4, 4); fallthrough; default: memset(optptr, 0, optlen); } l -= optlen; optptr += optlen; } return 0; } static void ah_output_done(void *data, int err) { u8 *icv; struct iphdr *iph; struct sk_buff *skb = data; struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct iphdr *top_iph = ip_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); iph = AH_SKB_CB(skb)->tmp; icv = ah_tmp_icv(iph, ihl); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; top_iph->frag_off = iph->frag_off; if (top_iph->ihl != 5) { top_iph->daddr = iph->daddr; memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb->sk, skb, err); } static int ah_output(struct xfrm_state *x, struct sk_buff *skb) { int err; int nfrags; int ihl; u8 *icv; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; skb_push(skb, -skb_network_offset(skb)); ah = ip_auth_hdr(skb); ihl = ip_hdrlen(skb); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } err = -ENOMEM; iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len); if (!iph) goto out; seqhi = (__be32 *)((char *)iph + ihl); icv = ah_tmp_icv(seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memset(ah->auth_data, 0, ahp->icv_trunc_len); top_iph = ip_hdr(skb); iph->tos = top_iph->tos; iph->ttl = top_iph->ttl; iph->frag_off = top_iph->frag_off; if (top_iph->ihl != 5) { iph->daddr = top_iph->daddr; memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); err = ip_clear_mutable_options(top_iph, &top_iph->daddr); if (err) goto out_free; } ah->nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_AH; top_iph->tos = 0; top_iph->tot_len = htons(skb->len); top_iph->frag_off = 0; top_iph->ttl = 0; top_iph->check = 0; if (x->props.flags & XFRM_STATE_ALIGN4) ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; else ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_output_done, skb); AH_SKB_CB(skb)->tmp = iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; top_iph->frag_off = iph->frag_off; if (top_iph->ihl != 5) { top_iph->daddr = iph->daddr; memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } out_free: kfree(iph); out: return err; } static void ah_input_done(void *data, int err) { u8 *auth_data; u8 *icv; struct iphdr *work_iph; struct sk_buff *skb = data; struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; if (err) goto out; work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, ihl); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out; err = ah->nexthdr; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); } static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; int ihl; int nexthdr; int nfrags; u8 *auth_data; u8 *icv; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct iphdr *iph, *work_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; int err = -ENOMEM; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; ahash = ahp->ahash; nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; if (x->props.flags & XFRM_STATE_ALIGN4) { if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) goto out; } else { if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) goto out; } if (!pskb_may_pull(skb, ah_hlen)) goto out; /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; ah = (struct ip_auth_hdr *)skb->data; iph = ip_hdr(skb); ihl = ip_hdrlen(skb); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + ahp->icv_trunc_len + seqhi_len); if (!work_iph) { err = -ENOMEM; goto out; } seqhi = (__be32 *)((char *)work_iph + ihl); auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memcpy(work_iph, iph, ihl); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; if (ihl > sizeof(*iph)) { __be32 dummy; err = ip_clear_mutable_options(iph, &dummy); if (err) goto out_free; } skb_push(skb, ihl); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; goto out_free; } err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out_free; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); err = nexthdr; out_free: kfree (work_iph); out: return err; } static int ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH); else ipv4_redirect(skb, net, 0, IPPROTO_AH); xfrm_state_put(x); return 0; } static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; struct crypto_ahash *ahash; if (!x->aalg) { NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm"); goto error; } if (x->encap) { NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation"); goto error; } ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); if (!ahp) return -ENOMEM; ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); if (IS_ERR(ahash)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } ahp->ahash = ahash; if (crypto_ahash_setkey(ahash, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } /* * Lookup the algorithm description maintained by xfrm_algo, * verify crypto transform properties, and store information * we need for AH processing. This lookup cannot fail here * after a successful crypto_alloc_ahash(). */ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); BUG_ON(!aalg_desc); if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; if (x->props.flags & XFRM_STATE_ALIGN4) x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); else x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; return 0; error: if (ahp) { crypto_free_ahash(ahp->ahash); kfree(ahp); } return -EINVAL; } static void ah_destroy(struct xfrm_state *x) { struct ah_data *ahp = x->data; if (!ahp) return; crypto_free_ahash(ahp->ahash); kfree(ahp); } static int ah4_rcv_cb(struct sk_buff *skb, int err) { return 0; } static const struct xfrm_type ah_type = { .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah_init_state, .destructor = ah_destroy, .input = ah_input, .output = ah_output }; static struct xfrm4_protocol ah4_protocol = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = ah4_rcv_cb, .err_handler = ah4_err, .priority = 0, }; static int __init ah4_init(void) { if (xfrm_register_type(&ah_type, AF_INET) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); return -EAGAIN; } return 0; } static void __exit ah4_fini(void) { if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); } module_init(ah4_init); module_exit(ah4_fini); MODULE_DESCRIPTION("IPv4 AH transformation library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
44 44 44 44 48 12 12 49 1 43 3 8 3 8 3 43 3 3 3 43 8 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 // SPDX-License-Identifier: GPL-2.0-or-later /* Client connection-specific management code. * * Copyright (C) 2016, 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Client connections need to be cached for a little while after they've made a * call so as to handle retransmitted DATA packets in case the server didn't * receive the final ACK or terminating ABORT we sent it. * * There are flags of relevance to the cache: * * (2) DONT_REUSE - The connection should be discarded as soon as possible and * should not be reused. This is set when an exclusive connection is used * or a call ID counter overflows. * * The caching state may only be changed if the cache lock is held. * * There are two idle client connection expiry durations. If the total number * of connections is below the reap threshold, we use the normal duration; if * it's above, we use the fast duration. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/idr.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include "ar-internal.h" __read_mostly unsigned int rxrpc_reap_client_connections = 900; __read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; static void rxrpc_activate_bundle(struct rxrpc_bundle *bundle) { atomic_inc(&bundle->active); } /* * Release a connection ID for a client connection. */ static void rxrpc_put_client_connection_id(struct rxrpc_local *local, struct rxrpc_connection *conn) { idr_remove(&local->conn_ids, conn->proto.cid >> RXRPC_CIDSHIFT); } /* * Destroy the client connection ID tree. */ static void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) { struct rxrpc_connection *conn; int id; if (!idr_is_empty(&local->conn_ids)) { idr_for_each_entry(&local->conn_ids, conn, id) { pr_err("AF_RXRPC: Leaked client conn %p {%d}\n", conn, refcount_read(&conn->ref)); } BUG(); } idr_destroy(&local->conn_ids); } /* * Allocate a connection bundle. */ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, gfp_t gfp) { static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle; bundle = kzalloc(sizeof(*bundle), gfp); if (bundle) { bundle->local = call->local; bundle->peer = rxrpc_get_peer(call->peer, rxrpc_peer_get_bundle); bundle->key = key_get(call->key); bundle->security = call->security; bundle->exclusive = test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); bundle->upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); bundle->service_id = call->dest_srx.srx_service; bundle->security_level = call->security_level; bundle->debug_id = atomic_inc_return(&rxrpc_bundle_id); refcount_set(&bundle->ref, 1); atomic_set(&bundle->active, 1); INIT_LIST_HEAD(&bundle->waiting_calls); trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_new); write_lock(&bundle->local->rxnet->conn_lock); list_add_tail(&bundle->proc_link, &bundle->local->rxnet->bundle_proc_list); write_unlock(&bundle->local->rxnet->conn_lock); } return bundle; } struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why) { int r; __refcount_inc(&bundle->ref, &r); trace_rxrpc_bundle(bundle->debug_id, r + 1, why); return bundle; } static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { trace_rxrpc_bundle(bundle->debug_id, refcount_read(&bundle->ref), rxrpc_bundle_free); write_lock(&bundle->local->rxnet->conn_lock); list_del(&bundle->proc_link); write_unlock(&bundle->local->rxnet->conn_lock); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); key_put(bundle->key); kfree(bundle); } void rxrpc_put_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why) { unsigned int id; bool dead; int r; if (bundle) { id = bundle->debug_id; dead = __refcount_dec_and_test(&bundle->ref, &r); trace_rxrpc_bundle(id, r - 1, why); if (dead) rxrpc_free_bundle(bundle); } } /* * Get rid of outstanding client connection preallocations when a local * endpoint is destroyed. */ void rxrpc_purge_client_connections(struct rxrpc_local *local) { rxrpc_destroy_client_conn_ids(local); } /* * Allocate a client connection. */ static struct rxrpc_connection * rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle) { struct rxrpc_connection *conn; struct rxrpc_local *local = bundle->local; struct rxrpc_net *rxnet = local->rxnet; int id; _enter(""); conn = rxrpc_alloc_connection(rxnet, GFP_ATOMIC | __GFP_NOWARN); if (!conn) return ERR_PTR(-ENOMEM); id = idr_alloc_cyclic(&local->conn_ids, conn, 1, 0x40000000, GFP_ATOMIC | __GFP_NOWARN); if (id < 0) { kfree(conn); return ERR_PTR(id); } refcount_set(&conn->ref, 1); conn->proto.cid = id << RXRPC_CIDSHIFT; conn->proto.epoch = local->rxnet->epoch; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->bundle = rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_conn); conn->local = rxrpc_get_local(bundle->local, rxrpc_local_get_client_conn); conn->peer = rxrpc_get_peer(bundle->peer, rxrpc_peer_get_client_conn); conn->key = key_get(bundle->key); conn->security = bundle->security; conn->exclusive = bundle->exclusive; conn->upgrade = bundle->upgrade; conn->orig_service_id = bundle->service_id; conn->security_level = bundle->security_level; conn->state = RXRPC_CONN_CLIENT_UNSECURED; conn->service_id = conn->orig_service_id; if (conn->security == &rxrpc_no_security) conn->state = RXRPC_CONN_CLIENT; atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); rxrpc_see_connection(conn, rxrpc_conn_new_client); atomic_inc(&rxnet->nr_client_conns); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); return conn; } /* * Determine if a connection may be reused. */ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) { struct rxrpc_net *rxnet; int id_cursor, id, distance, limit; if (!conn) goto dont_reuse; rxnet = conn->rxnet; if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) goto dont_reuse; if ((conn->state != RXRPC_CONN_CLIENT_UNSECURED && conn->state != RXRPC_CONN_CLIENT) || conn->proto.epoch != rxnet->epoch) goto mark_dont_reuse; /* The IDR tree gets very expensive on memory if the connection IDs are * widely scattered throughout the number space, so we shall want to * kill off connections that, say, have an ID more than about four * times the maximum number of client conns away from the current * allocation point to try and keep the IDs concentrated. */ id_cursor = idr_get_cursor(&conn->local->conn_ids); id = conn->proto.cid >> RXRPC_CIDSHIFT; distance = id - id_cursor; if (distance < 0) distance = -distance; limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; return true; mark_dont_reuse: set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); dont_reuse: return false; } /* * Look up the conn bundle that matches the connection parameters, adding it if * it doesn't yet exist. */ int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_bundle *bundle, *candidate; struct rxrpc_local *local = call->local; struct rb_node *p, **pp, *parent; long diff; bool upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); _enter("{%px,%x,%u,%u}", call->peer, key_serial(call->key), call->security_level, upgrade); if (test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags)) { call->bundle = rxrpc_alloc_bundle(call, gfp); return call->bundle ? 0 : -ENOMEM; } /* First, see if the bundle is already there. */ _debug("search 1"); spin_lock(&local->client_bundles_lock); p = local->client_bundles.rb_node; while (p) { bundle = rb_entry(p, struct rxrpc_bundle, local_node); #define cmp(X, Y) ((long)(X) - (long)(Y)) diff = (cmp(bundle->peer, call->peer) ?: cmp(bundle->key, call->key) ?: cmp(bundle->security_level, call->security_level) ?: cmp(bundle->upgrade, upgrade)); #undef cmp if (diff < 0) p = p->rb_left; else if (diff > 0) p = p->rb_right; else goto found_bundle; } spin_unlock(&local->client_bundles_lock); _debug("not found"); /* It wasn't. We need to add one. */ candidate = rxrpc_alloc_bundle(call, gfp); if (!candidate) return -ENOMEM; _debug("search 2"); spin_lock(&local->client_bundles_lock); pp = &local->client_bundles.rb_node; parent = NULL; while (*pp) { parent = *pp; bundle = rb_entry(parent, struct rxrpc_bundle, local_node); #define cmp(X, Y) ((long)(X) - (long)(Y)) diff = (cmp(bundle->peer, call->peer) ?: cmp(bundle->key, call->key) ?: cmp(bundle->security_level, call->security_level) ?: cmp(bundle->upgrade, upgrade)); #undef cmp if (diff < 0) pp = &(*pp)->rb_left; else if (diff > 0) pp = &(*pp)->rb_right; else goto found_bundle_free; } _debug("new bundle"); rb_link_node(&candidate->local_node, parent, pp); rb_insert_color(&candidate->local_node, &local->client_bundles); call->bundle = rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); spin_unlock(&local->client_bundles_lock); _leave(" = B=%u [new]", call->bundle->debug_id); return 0; found_bundle_free: rxrpc_free_bundle(candidate); found_bundle: call->bundle = rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_call); rxrpc_activate_bundle(bundle); spin_unlock(&local->client_bundles_lock); _leave(" = B=%u [found]", call->bundle->debug_id); return 0; } /* * Allocate a new connection and add it into a bundle. */ static bool rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, unsigned int slot) { struct rxrpc_connection *conn, *old; unsigned int shift = slot * RXRPC_MAXCALLS; unsigned int i; old = bundle->conns[slot]; if (old) { bundle->conns[slot] = NULL; bundle->conn_ids[slot] = 0; trace_rxrpc_client(old, -1, rxrpc_client_replace); rxrpc_put_connection(old, rxrpc_conn_put_noreuse); } conn = rxrpc_alloc_client_connection(bundle); if (IS_ERR(conn)) { bundle->alloc_error = PTR_ERR(conn); return false; } rxrpc_activate_bundle(bundle); conn->bundle_shift = shift; bundle->conns[slot] = conn; bundle->conn_ids[slot] = conn->debug_id; for (i = 0; i < RXRPC_MAXCALLS; i++) set_bit(shift + i, &bundle->avail_chans); return true; } /* * Add a connection to a bundle if there are no usable connections or we have * connections waiting for extra capacity. */ static bool rxrpc_bundle_has_space(struct rxrpc_bundle *bundle) { int slot = -1, i, usable; _enter(""); bundle->alloc_error = 0; /* See if there are any usable connections. */ usable = 0; for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) { if (rxrpc_may_reuse_conn(bundle->conns[i])) usable++; else if (slot == -1) slot = i; } if (!usable && bundle->upgrade) bundle->try_upgrade = true; if (!usable) goto alloc_conn; if (!bundle->avail_chans && !bundle->try_upgrade && usable < ARRAY_SIZE(bundle->conns)) goto alloc_conn; _leave(""); return usable; alloc_conn: return slot >= 0 ? rxrpc_add_conn_to_bundle(bundle, slot) : false; } /* * Assign a channel to the call at the front of the queue and wake the call up. * We don't increment the callNumber counter until this number has been exposed * to the world. */ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, unsigned int channel) { struct rxrpc_channel *chan = &conn->channels[channel]; struct rxrpc_bundle *bundle = conn->bundle; struct rxrpc_call *call = list_entry(bundle->waiting_calls.next, struct rxrpc_call, wait_link); u32 call_id = chan->call_counter + 1; _enter("C=%x,%u", conn->debug_id, channel); list_del_init(&call->wait_link); trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); /* Cancel the final ACK on the previous call if it hasn't been sent yet * as the DATA packet will implicitly ACK it. */ clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); clear_bit(conn->bundle_shift + channel, &bundle->avail_chans); rxrpc_see_call(call, rxrpc_call_see_activate_client); call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); call->cid = conn->proto.cid | channel; call->call_id = call_id; call->dest_srx.srx_service = conn->service_id; call->cong_ssthresh = call->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; else call->cong_ca_state = RXRPC_CA_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; chan->call = call; rxrpc_see_call(call, rxrpc_call_see_connected); trace_rxrpc_connect_call(call); call->tx_last_sent = ktime_get_real(); rxrpc_start_call_timer(call); rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_SEND_REQUEST); wake_up(&call->waitq); } /* * Remove a connection from the idle list if it's on it. */ static void rxrpc_unidle_conn(struct rxrpc_connection *conn) { if (!list_empty(&conn->cache_link)) { list_del_init(&conn->cache_link); rxrpc_put_connection(conn, rxrpc_conn_put_unidle); } } /* * Assign channels and callNumbers to waiting calls. */ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) { struct rxrpc_connection *conn; unsigned long avail, mask; unsigned int channel, slot; trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans); if (bundle->try_upgrade) mask = 1; else mask = ULONG_MAX; while (!list_empty(&bundle->waiting_calls)) { avail = bundle->avail_chans & mask; if (!avail) break; channel = __ffs(avail); clear_bit(channel, &bundle->avail_chans); slot = channel / RXRPC_MAXCALLS; conn = bundle->conns[slot]; if (!conn) break; if (bundle->try_upgrade) set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); rxrpc_unidle_conn(conn); channel &= (RXRPC_MAXCALLS - 1); conn->act_chans |= 1 << channel; rxrpc_activate_one_channel(conn, channel); } } /* * Connect waiting channels (called from the I/O thread). */ void rxrpc_connect_client_calls(struct rxrpc_local *local) { struct rxrpc_call *call; LIST_HEAD(new_client_calls); spin_lock_irq(&local->client_call_lock); list_splice_tail_init(&local->new_client_calls, &new_client_calls); spin_unlock_irq(&local->client_call_lock); while ((call = list_first_entry_or_null(&new_client_calls, struct rxrpc_call, wait_link))) { struct rxrpc_bundle *bundle = call->bundle; list_move_tail(&call->wait_link, &bundle->waiting_calls); rxrpc_see_call(call, rxrpc_call_see_waiting_call); if (rxrpc_bundle_has_space(bundle)) rxrpc_activate_channels(bundle); } } /* * Note that a call, and thus a connection, is about to be exposed to the * world. */ void rxrpc_expose_client_call(struct rxrpc_call *call) { unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) { /* Mark the call ID as being used. If the callNumber counter * exceeds ~2 billion, we kill the connection after its * outstanding calls have finished so that the counter doesn't * wrap. */ chan->call_counter++; if (chan->call_counter >= INT_MAX) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); trace_rxrpc_client(conn, channel, rxrpc_client_exposed); spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); spin_unlock_irq(&call->peer->lock); } } /* * Set the reap timer. */ static void rxrpc_set_client_reap_timer(struct rxrpc_local *local) { if (!local->kill_all_client_conns) { unsigned long now = jiffies; unsigned long reap_at = now + rxrpc_conn_idle_client_expiry; if (local->rxnet->live) timer_reduce(&local->client_conn_reap_timer, reap_at); } } /* * Disconnect a client call. */ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call *call) { struct rxrpc_connection *conn; struct rxrpc_channel *chan = NULL; struct rxrpc_local *local = bundle->local; unsigned int channel; bool may_reuse; u32 cid; _enter("c=%x", call->debug_id); /* Calls that have never actually been assigned a channel can simply be * discarded. */ conn = call->conn; if (!conn) { _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); /* May still be on ->new_client_calls. */ spin_lock_irq(&local->client_call_lock); list_del_init(&call->wait_link); spin_unlock_irq(&local->client_call_lock); return; } cid = call->cid; channel = cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); if (WARN_ON(chan->call != call)) return; may_reuse = rxrpc_may_reuse_conn(conn); /* If a client call was exposed to the world, we save the result for * retransmission. * * We use a barrier here so that the call number and abort code can be * read without needing to take a lock. * * TODO: Make the incoming packet handler check this and handle * terminal retransmission without requiring access to the call. */ if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { _debug("exposed %u,%u", call->call_id, call->abort_code); __rxrpc_disconnect_call(conn, call); if (test_and_clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) { trace_rxrpc_client(conn, channel, rxrpc_client_to_active); bundle->try_upgrade = false; if (may_reuse) rxrpc_activate_channels(bundle); } } /* See if we can pass the channel directly to another call. */ if (may_reuse && !list_empty(&bundle->waiting_calls)) { trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); return; } /* Schedule the final ACK to be transmitted in a short while so that it * can be skipped if we find a follow-on call. The first DATA packet * of the follow on call will implicitly ACK this call. */ if (call->completion == RXRPC_CALL_SUCCEEDED && test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { unsigned long final_ack_at = jiffies + 2; chan->final_ack_at = final_ack_at; smp_wmb(); /* vs rxrpc_process_delayed_final_acks() */ set_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); rxrpc_reduce_conn_timer(conn, final_ack_at); } /* Deactivate the channel. */ chan->call = NULL; set_bit(conn->bundle_shift + channel, &conn->bundle->avail_chans); conn->act_chans &= ~(1 << channel); /* If no channels remain active, then put the connection on the idle * list for a short while. Give it a ref to stop it going away if it * becomes unbundled. */ if (!conn->act_chans) { trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); conn->idle_timestamp = jiffies; rxrpc_get_connection(conn, rxrpc_conn_get_idle); list_move_tail(&conn->cache_link, &local->idle_client_conns); rxrpc_set_client_reap_timer(local); } } /* * Remove a connection from a bundle. */ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) { struct rxrpc_bundle *bundle = conn->bundle; unsigned int bindex; int i; _enter("C=%x", conn->debug_id); if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, true); bindex = conn->bundle_shift / RXRPC_MAXCALLS; if (bundle->conns[bindex] == conn) { _debug("clear slot %u", bindex); bundle->conns[bindex] = NULL; bundle->conn_ids[bindex] = 0; for (i = 0; i < RXRPC_MAXCALLS; i++) clear_bit(conn->bundle_shift + i, &bundle->avail_chans); rxrpc_put_client_connection_id(bundle->local, conn); rxrpc_deactivate_bundle(bundle); rxrpc_put_connection(conn, rxrpc_conn_put_unbundle); } } /* * Drop the active count on a bundle. */ void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) { struct rxrpc_local *local; bool need_put = false; if (!bundle) return; local = bundle->local; if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { if (!bundle->exclusive) { _debug("erase bundle"); rb_erase(&bundle->local_node, &local->client_bundles); need_put = true; } spin_unlock(&local->client_bundles_lock); if (need_put) rxrpc_put_bundle(bundle, rxrpc_bundle_put_discard); } } /* * Clean up a dead client connection. */ void rxrpc_kill_client_conn(struct rxrpc_connection *conn) { struct rxrpc_local *local = conn->local; struct rxrpc_net *rxnet = local->rxnet; _enter("C=%x", conn->debug_id); trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); atomic_dec(&rxnet->nr_client_conns); rxrpc_put_client_connection_id(local, conn); } /* * Discard expired client connections from the idle list. Each conn in the * idle list has been exposed and holds an extra ref because of that. * * This may be called from conn setup or from a work item so cannot be * considered non-reentrant. */ void rxrpc_discard_expired_client_conns(struct rxrpc_local *local) { struct rxrpc_connection *conn; unsigned long expiry, conn_expires_at, now; unsigned int nr_conns; _enter(""); /* We keep an estimate of what the number of conns ought to be after * we've discarded some so that we don't overdo the discarding. */ nr_conns = atomic_read(&local->rxnet->nr_client_conns); next: conn = list_first_entry_or_null(&local->idle_client_conns, struct rxrpc_connection, cache_link); if (!conn) return; if (!local->kill_all_client_conns) { /* If the number of connections is over the reap limit, we * expedite discard by reducing the expiry timeout. We must, * however, have at least a short grace period to be able to do * final-ACK or ABORT retransmission. */ expiry = rxrpc_conn_idle_client_expiry; if (nr_conns > rxrpc_reap_client_connections) expiry = rxrpc_conn_idle_client_fast_expiry; if (conn->local->service_closed) expiry = rxrpc_closed_conn_expiry * HZ; conn_expires_at = conn->idle_timestamp + expiry; now = jiffies; if (time_after(conn_expires_at, now)) goto not_yet_expired; } atomic_dec(&conn->active); trace_rxrpc_client(conn, -1, rxrpc_client_discard); list_del_init(&conn->cache_link); rxrpc_unbundle_conn(conn); /* Drop the ->cache_link ref */ rxrpc_put_connection(conn, rxrpc_conn_put_discard_idle); nr_conns--; goto next; not_yet_expired: /* The connection at the front of the queue hasn't yet expired, so * schedule the work item for that point if we discarded something. * * We don't worry if the work item is already scheduled - it can look * after rescheduling itself at a later time. We could cancel it, but * then things get messier. */ _debug("not yet"); if (!local->kill_all_client_conns) timer_reduce(&local->client_conn_reap_timer, conn_expires_at); _leave(""); } /* * Clean up the client connections on a local endpoint. */ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) { struct rxrpc_connection *conn; _enter(""); local->kill_all_client_conns = true; del_timer_sync(&local->client_conn_reap_timer); while ((conn = list_first_entry_or_null(&local->idle_client_conns, struct rxrpc_connection, cache_link))) { list_del_init(&conn->cache_link); atomic_dec(&conn->active); trace_rxrpc_client(conn, -1, rxrpc_client_discard); rxrpc_unbundle_conn(conn); rxrpc_put_connection(conn, rxrpc_conn_put_local_dead); } _leave(" [culled]"); }
6 6 6 6 6 6 6 6 1 6 2 6 6 2 6 6 5 5 5 4 5 8 3 2 2 1 7 8 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 // SPDX-License-Identifier: GPL-2.0-or-later /* * PRNG: Pseudo Random Number Generator * Based on NIST Recommended PRNG From ANSI X9.31 Appendix A.2.4 using * AES 128 cipher * * (C) Neil Horman <nhorman@tuxdriver.com> */ #include <crypto/internal/cipher.h> #include <crypto/internal/rng.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/string.h> #define DEFAULT_PRNG_KEY "0123456789abcdef" #define DEFAULT_PRNG_KSZ 16 #define DEFAULT_BLK_SZ 16 #define DEFAULT_V_SEED "zaybxcwdveuftgsh" /* * Flags for the prng_context flags field */ #define PRNG_FIXED_SIZE 0x1 #define PRNG_NEED_RESET 0x2 /* * Note: DT is our counter value * I is our intermediate value * V is our seed vector * See http://csrc.nist.gov/groups/STM/cavp/documents/rng/931rngext.pdf * for implementation details */ struct prng_context { spinlock_t prng_lock; unsigned char rand_data[DEFAULT_BLK_SZ]; unsigned char last_rand_data[DEFAULT_BLK_SZ]; unsigned char DT[DEFAULT_BLK_SZ]; unsigned char I[DEFAULT_BLK_SZ]; unsigned char V[DEFAULT_BLK_SZ]; u32 rand_data_valid; struct crypto_cipher *tfm; u32 flags; }; static int dbg; static void hexdump(char *note, unsigned char *buf, unsigned int len) { if (dbg) { printk(KERN_CRIT "%s", note); print_hex_dump(KERN_CONT, "", DUMP_PREFIX_OFFSET, 16, 1, buf, len, false); } } #define dbgprint(format, args...) do {\ if (dbg)\ printk(format, ##args);\ } while (0) static void xor_vectors(unsigned char *in1, unsigned char *in2, unsigned char *out, unsigned int size) { int i; for (i = 0; i < size; i++) out[i] = in1[i] ^ in2[i]; } /* * Returns DEFAULT_BLK_SZ bytes of random data per call * returns 0 if generation succeeded, <0 if something went wrong */ static int _get_more_prng_bytes(struct prng_context *ctx, int cont_test) { int i; unsigned char tmp[DEFAULT_BLK_SZ]; unsigned char *output = NULL; dbgprint(KERN_CRIT "Calling _get_more_prng_bytes for context %p\n", ctx); hexdump("Input DT: ", ctx->DT, DEFAULT_BLK_SZ); hexdump("Input I: ", ctx->I, DEFAULT_BLK_SZ); hexdump("Input V: ", ctx->V, DEFAULT_BLK_SZ); /* * This algorithm is a 3 stage state machine */ for (i = 0; i < 3; i++) { switch (i) { case 0: /* * Start by encrypting the counter value * This gives us an intermediate value I */ memcpy(tmp, ctx->DT, DEFAULT_BLK_SZ); output = ctx->I; hexdump("tmp stage 0: ", tmp, DEFAULT_BLK_SZ); break; case 1: /* * Next xor I with our secret vector V * encrypt that result to obtain our * pseudo random data which we output */ xor_vectors(ctx->I, ctx->V, tmp, DEFAULT_BLK_SZ); hexdump("tmp stage 1: ", tmp, DEFAULT_BLK_SZ); output = ctx->rand_data; break; case 2: /* * First check that we didn't produce the same * random data that we did last time around through this */ if (!memcmp(ctx->rand_data, ctx->last_rand_data, DEFAULT_BLK_SZ)) { if (cont_test) { panic("cprng %p Failed repetition check!\n", ctx); } printk(KERN_ERR "ctx %p Failed repetition check!\n", ctx); ctx->flags |= PRNG_NEED_RESET; return -EINVAL; } memcpy(ctx->last_rand_data, ctx->rand_data, DEFAULT_BLK_SZ); /* * Lastly xor the random data with I * and encrypt that to obtain a new secret vector V */ xor_vectors(ctx->rand_data, ctx->I, tmp, DEFAULT_BLK_SZ); output = ctx->V; hexdump("tmp stage 2: ", tmp, DEFAULT_BLK_SZ); break; } /* do the encryption */ crypto_cipher_encrypt_one(ctx->tfm, output, tmp); } /* * Now update our DT value */ for (i = DEFAULT_BLK_SZ - 1; i >= 0; i--) { ctx->DT[i] += 1; if (ctx->DT[i] != 0) break; } dbgprint("Returning new block for context %p\n", ctx); ctx->rand_data_valid = 0; hexdump("Output DT: ", ctx->DT, DEFAULT_BLK_SZ); hexdump("Output I: ", ctx->I, DEFAULT_BLK_SZ); hexdump("Output V: ", ctx->V, DEFAULT_BLK_SZ); hexdump("New Random Data: ", ctx->rand_data, DEFAULT_BLK_SZ); return 0; } /* Our exported functions */ static int get_prng_bytes(char *buf, size_t nbytes, struct prng_context *ctx, int do_cont_test) { unsigned char *ptr = buf; unsigned int byte_count = (unsigned int)nbytes; int err; spin_lock_bh(&ctx->prng_lock); err = -EINVAL; if (ctx->flags & PRNG_NEED_RESET) goto done; /* * If the FIXED_SIZE flag is on, only return whole blocks of * pseudo random data */ err = -EINVAL; if (ctx->flags & PRNG_FIXED_SIZE) { if (nbytes < DEFAULT_BLK_SZ) goto done; byte_count = DEFAULT_BLK_SZ; } /* * Return 0 in case of success as mandated by the kernel * crypto API interface definition. */ err = 0; dbgprint(KERN_CRIT "getting %d random bytes for context %p\n", byte_count, ctx); remainder: if (ctx->rand_data_valid == DEFAULT_BLK_SZ) { if (_get_more_prng_bytes(ctx, do_cont_test) < 0) { memset(buf, 0, nbytes); err = -EINVAL; goto done; } } /* * Copy any data less than an entire block */ if (byte_count < DEFAULT_BLK_SZ) { empty_rbuf: while (ctx->rand_data_valid < DEFAULT_BLK_SZ) { *ptr = ctx->rand_data[ctx->rand_data_valid]; ptr++; byte_count--; ctx->rand_data_valid++; if (byte_count == 0) goto done; } } /* * Now copy whole blocks */ for (; byte_count >= DEFAULT_BLK_SZ; byte_count -= DEFAULT_BLK_SZ) { if (ctx->rand_data_valid == DEFAULT_BLK_SZ) { if (_get_more_prng_bytes(ctx, do_cont_test) < 0) { memset(buf, 0, nbytes); err = -EINVAL; goto done; } } if (ctx->rand_data_valid > 0) goto empty_rbuf; memcpy(ptr, ctx->rand_data, DEFAULT_BLK_SZ); ctx->rand_data_valid += DEFAULT_BLK_SZ; ptr += DEFAULT_BLK_SZ; } /* * Now go back and get any remaining partial block */ if (byte_count) goto remainder; done: spin_unlock_bh(&ctx->prng_lock); dbgprint(KERN_CRIT "returning %d from get_prng_bytes in context %p\n", err, ctx); return err; } static void free_prng_context(struct prng_context *ctx) { crypto_free_cipher(ctx->tfm); } static int reset_prng_context(struct prng_context *ctx, const unsigned char *key, size_t klen, const unsigned char *V, const unsigned char *DT) { int ret; const unsigned char *prng_key; spin_lock_bh(&ctx->prng_lock); ctx->flags |= PRNG_NEED_RESET; prng_key = (key != NULL) ? key : (unsigned char *)DEFAULT_PRNG_KEY; if (!key) klen = DEFAULT_PRNG_KSZ; if (V) memcpy(ctx->V, V, DEFAULT_BLK_SZ); else memcpy(ctx->V, DEFAULT_V_SEED, DEFAULT_BLK_SZ); if (DT) memcpy(ctx->DT, DT, DEFAULT_BLK_SZ); else memset(ctx->DT, 0, DEFAULT_BLK_SZ); memset(ctx->rand_data, 0, DEFAULT_BLK_SZ); memset(ctx->last_rand_data, 0, DEFAULT_BLK_SZ); ctx->rand_data_valid = DEFAULT_BLK_SZ; ret = crypto_cipher_setkey(ctx->tfm, prng_key, klen); if (ret) { dbgprint(KERN_CRIT "PRNG: setkey() failed flags=%x\n", crypto_cipher_get_flags(ctx->tfm)); goto out; } ret = 0; ctx->flags &= ~PRNG_NEED_RESET; out: spin_unlock_bh(&ctx->prng_lock); return ret; } static int cprng_init(struct crypto_tfm *tfm) { struct prng_context *ctx = crypto_tfm_ctx(tfm); spin_lock_init(&ctx->prng_lock); ctx->tfm = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(ctx->tfm)) { dbgprint(KERN_CRIT "Failed to alloc tfm for context %p\n", ctx); return PTR_ERR(ctx->tfm); } if (reset_prng_context(ctx, NULL, DEFAULT_PRNG_KSZ, NULL, NULL) < 0) return -EINVAL; /* * after allocation, we should always force the user to reset * so they don't inadvertently use the insecure default values * without specifying them intentially */ ctx->flags |= PRNG_NEED_RESET; return 0; } static void cprng_exit(struct crypto_tfm *tfm) { free_prng_context(crypto_tfm_ctx(tfm)); } static int cprng_get_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) { struct prng_context *prng = crypto_rng_ctx(tfm); return get_prng_bytes(rdata, dlen, prng, 0); } /* * This is the cprng_registered reset method the seed value is * interpreted as the tuple { V KEY DT} * V and KEY are required during reset, and DT is optional, detected * as being present by testing the length of the seed */ static int cprng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { struct prng_context *prng = crypto_rng_ctx(tfm); const u8 *key = seed + DEFAULT_BLK_SZ; const u8 *dt = NULL; if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ) return -EINVAL; if (slen >= (2 * DEFAULT_BLK_SZ + DEFAULT_PRNG_KSZ)) dt = key + DEFAULT_PRNG_KSZ; reset_prng_context(prng, key, DEFAULT_PRNG_KSZ, seed, dt); if (prng->flags & PRNG_NEED_RESET) return -EINVAL; return 0; } #ifdef CONFIG_CRYPTO_FIPS static int fips_cprng_get_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) { struct prng_context *prng = crypto_rng_ctx(tfm); return get_prng_bytes(rdata, dlen, prng, 1); } static int fips_cprng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { u8 rdata[DEFAULT_BLK_SZ]; const u8 *key = seed + DEFAULT_BLK_SZ; int rc; struct prng_context *prng = crypto_rng_ctx(tfm); if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ) return -EINVAL; /* fips strictly requires seed != key */ if (!memcmp(seed, key, DEFAULT_PRNG_KSZ)) return -EINVAL; rc = cprng_reset(tfm, seed, slen); if (!rc) goto out; /* this primes our continuity test */ rc = get_prng_bytes(rdata, DEFAULT_BLK_SZ, prng, 0); prng->rand_data_valid = DEFAULT_BLK_SZ; out: return rc; } #endif static struct rng_alg rng_algs[] = { { .generate = cprng_get_random, .seed = cprng_reset, .seedsize = DEFAULT_PRNG_KSZ + 2 * DEFAULT_BLK_SZ, .base = { .cra_name = "stdrng", .cra_driver_name = "ansi_cprng", .cra_priority = 100, .cra_ctxsize = sizeof(struct prng_context), .cra_module = THIS_MODULE, .cra_init = cprng_init, .cra_exit = cprng_exit, } #ifdef CONFIG_CRYPTO_FIPS }, { .generate = fips_cprng_get_random, .seed = fips_cprng_reset, .seedsize = DEFAULT_PRNG_KSZ + 2 * DEFAULT_BLK_SZ, .base = { .cra_name = "fips(ansi_cprng)", .cra_driver_name = "fips_ansi_cprng", .cra_priority = 300, .cra_ctxsize = sizeof(struct prng_context), .cra_module = THIS_MODULE, .cra_init = cprng_init, .cra_exit = cprng_exit, } #endif } }; /* Module initalization */ static int __init prng_mod_init(void) { return crypto_register_rngs(rng_algs, ARRAY_SIZE(rng_algs)); } static void __exit prng_mod_fini(void) { crypto_unregister_rngs(rng_algs, ARRAY_SIZE(rng_algs)); } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Software Pseudo Random Number Generator"); MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); module_param(dbg, int, 0); MODULE_PARM_DESC(dbg, "Boolean to enable debugging (0/1 == off/on)"); subsys_initcall(prng_mod_init); module_exit(prng_mod_fini); MODULE_ALIAS_CRYPTO("stdrng"); MODULE_ALIAS_CRYPTO("ansi_cprng"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
32 3 1 1 1 1 1 1 3 20 8 9 5 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2022 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/kernel.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_meta.h> #include <net/netfilter/nf_tables_offload.h> #include <linux/tcp.h> #include <linux/udp.h> #include <net/gre.h> #include <net/geneve.h> #include <net/ip.h> #include <linux/icmpv6.h> #include <linux/ip.h> #include <linux/ipv6.h> static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); /* Same layout as nft_expr but it embeds the private expression data area. */ struct __nft_expr { const struct nft_expr_ops *ops; union { struct nft_payload payload; struct nft_meta meta; } __attribute__((aligned(__alignof__(u64)))); }; enum { NFT_INNER_EXPR_PAYLOAD, NFT_INNER_EXPR_META, }; struct nft_inner { u8 flags; u8 hdrsize; u8 type; u8 expr_type; struct __nft_expr expr; }; static int nft_inner_parse_l2l3(const struct nft_inner *priv, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *ctx, u32 off) { __be16 llproto, outer_llproto; u32 nhoff, thoff; if (priv->flags & NFT_INNER_LL) { struct vlan_ethhdr *veth, _veth; struct ethhdr *eth, _eth; u32 hdrsize; eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth); if (!eth) return -1; switch (eth->h_proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): llproto = eth->h_proto; hdrsize = sizeof(_eth); break; case htons(ETH_P_8021Q): veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth); if (!veth) return -1; outer_llproto = veth->h_vlan_encapsulated_proto; llproto = veth->h_vlan_proto; hdrsize = sizeof(_veth); break; default: return -1; } ctx->inner_lloff = off; ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL; off += hdrsize; } else { struct iphdr *iph; u32 _version; iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version); if (!iph) return -1; switch (iph->version) { case 4: llproto = htons(ETH_P_IP); break; case 6: llproto = htons(ETH_P_IPV6); break; default: return -1; } } ctx->llproto = llproto; if (llproto == htons(ETH_P_8021Q)) llproto = outer_llproto; nhoff = off; switch (llproto) { case htons(ETH_P_IP): { struct iphdr *iph, _iph; iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph); if (!iph) return -1; if (iph->ihl < 5 || iph->version != 4) return -1; ctx->inner_nhoff = nhoff; ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; thoff = nhoff + (iph->ihl * 4); if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) { ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; ctx->inner_thoff = thoff; ctx->l4proto = iph->protocol; } } break; case htons(ETH_P_IPV6): { struct ipv6hdr *ip6h, _ip6h; int fh_flags = IP6_FH_F_AUTH; unsigned short fragoff; int l4proto; ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h); if (!ip6h) return -1; if (ip6h->version != 6) return -1; ctx->inner_nhoff = nhoff; ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; thoff = nhoff; l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags); if (l4proto < 0 || thoff > U16_MAX) return -1; if (fragoff == 0) { thoff = nhoff + sizeof(_ip6h); ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; ctx->inner_thoff = thoff; ctx->l4proto = l4proto; } } break; default: return -1; } return 0; } static int nft_inner_parse_tunhdr(const struct nft_inner *priv, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *ctx, u32 *off) { if (pkt->tprot == IPPROTO_GRE) { ctx->inner_tunoff = pkt->thoff; ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; return 0; } if (pkt->tprot != IPPROTO_UDP) return -1; ctx->inner_tunoff = *off; ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; *off += priv->hdrsize; switch (priv->type) { case NFT_INNER_GENEVE: { struct genevehdr *gnvh, _gnvh; gnvh = skb_header_pointer(pkt->skb, pkt->inneroff, sizeof(_gnvh), &_gnvh); if (!gnvh) return -1; *off += gnvh->opt_len * 4; } break; default: break; } return 0; } static int nft_inner_parse(const struct nft_inner *priv, struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { u32 off = pkt->inneroff; if (priv->flags & NFT_INNER_HDRSIZE && nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0) return -1; if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0) return -1; } else if (priv->flags & NFT_INNER_TH) { tun_ctx->inner_thoff = off; tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; } tun_ctx->type = priv->type; tun_ctx->cookie = (unsigned long)pkt->skb; pkt->flags |= NFT_PKTINFO_INNER_FULL; return 0; } static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { local_bh_enable(); return false; } *tun_ctx = *this_cpu_tun_ctx; local_bh_enable(); return true; } static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt, const struct nft_inner_tun_ctx *tun_ctx) { struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); if (this_cpu_tun_ctx->cookie != tun_ctx->cookie) *this_cpu_tun_ctx = *tun_ctx; local_bh_enable(); } static bool nft_inner_parse_needed(const struct nft_inner *priv, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { if (!(pkt->flags & NFT_PKTINFO_INNER_FULL)) return true; if (!nft_inner_restore_tun_ctx(pkt, tun_ctx)) return true; if (priv->type != tun_ctx->type) return true; return false; } static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_inner *priv = nft_expr_priv(expr); struct nft_inner_tun_ctx tun_ctx = {}; if (nft_payload_inner_offset(pkt) < 0) goto err; if (nft_inner_parse_needed(priv, pkt, &tun_ctx) && nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0) goto err; switch (priv->expr_type) { case NFT_INNER_EXPR_PAYLOAD: nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); break; case NFT_INNER_EXPR_META: nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); break; default: WARN_ON_ONCE(1); goto err; } nft_inner_save_tun_ctx(pkt, &tun_ctx); return; err: regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = { [NFTA_INNER_NUM] = { .type = NLA_U32 }, [NFTA_INNER_FLAGS] = { .type = NLA_U32 }, [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 }, [NFTA_INNER_TYPE] = { .type = NLA_U32 }, [NFTA_INNER_EXPR] = { .type = NLA_NESTED }, }; struct nft_expr_info { const struct nft_expr_ops *ops; const struct nlattr *attr; struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; }; static int nft_inner_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_inner *priv = nft_expr_priv(expr); u32 flags, hdrsize, type, num; struct nft_expr_info expr_info; int err; if (!tb[NFTA_INNER_FLAGS] || !tb[NFTA_INNER_NUM] || !tb[NFTA_INNER_HDRSIZE] || !tb[NFTA_INNER_TYPE] || !tb[NFTA_INNER_EXPR]) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS])); if (flags & ~NFT_INNER_MASK) return -EOPNOTSUPP; num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM])); if (num != 0) return -EOPNOTSUPP; hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE])); type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE])); if (type > U8_MAX) return -EINVAL; if (flags & NFT_INNER_HDRSIZE) { if (hdrsize == 0 || hdrsize > 64) return -EOPNOTSUPP; } priv->flags = flags; priv->hdrsize = hdrsize; priv->type = type; err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info); if (err < 0) return err; priv->expr.ops = expr_info.ops; if (!strcmp(expr_info.ops->type->name, "payload")) priv->expr_type = NFT_INNER_EXPR_PAYLOAD; else if (!strcmp(expr_info.ops->type->name, "meta")) priv->expr_type = NFT_INNER_EXPR_META; else return -EINVAL; err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr, (const struct nlattr * const*)expr_info.tb); if (err < 0) return err; return 0; } static int nft_inner_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_inner *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) || nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) || nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) || nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize))) goto nla_put_failure; if (nft_expr_dump(skb, NFTA_INNER_EXPR, (struct nft_expr *)&priv->expr, reset) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nft_expr_ops nft_inner_ops = { .type = &nft_inner_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)), .eval = nft_inner_eval, .init = nft_inner_init, .dump = nft_inner_dump, }; struct nft_expr_type nft_inner_type __read_mostly = { .name = "inner", .ops = &nft_inner_ops, .policy = nft_inner_policy, .maxattr = NFTA_INNER_MAX, .owner = THIS_MODULE, };
4 10 10 8 10 5 5 3 9 3 9 3 2 3 2 1 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 // SPDX-License-Identifier: GPL-2.0-only /* * TCP Vegas congestion control * * This is based on the congestion detection/avoidance scheme described in * Lawrence S. Brakmo and Larry L. Peterson. * "TCP Vegas: End to end congestion avoidance on a global internet." * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, * October 1995. Available from: * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps * * See http://www.cs.arizona.edu/xkernel/ for their implementation. * The main aspects that distinguish this implementation from the * Arizona Vegas implementation are: * o We do not change the loss detection or recovery mechanisms of * Linux in any way. Linux already recovers from losses quite well, * using fine-grained timers, NewReno, and FACK. * o To avoid the performance penalty imposed by increasing cwnd * only every-other RTT during slow start, we increase during * every RTT during slow start, just like Reno. * o Largely to allow continuous cwnd growth during slow start, * we use the rate at which ACKs come back as the "actual" * rate, rather than the rate at which data is sent. * o To speed convergence to the right rate, we set the cwnd * to achieve the right ("actual") rate when we exit slow start. * o To filter out the noise caused by delayed ACKs, we use the * minimum RTT sample observed during the last RTT to calculate * the actual rate. * o When the sender re-starts from idle, it waits until it has * received ACKs for an entire flight of new data before making * a cwnd adjustment decision. The original Vegas implementation * assumed senders never went idle. */ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <net/tcp.h> #include "tcp_vegas.h" static int alpha = 2; static int beta = 4; static int gamma = 1; module_param(alpha, int, 0644); MODULE_PARM_DESC(alpha, "lower bound of packets in network"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "upper bound of packets in network"); module_param(gamma, int, 0644); MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); /* There are several situations when we must "re-start" Vegas: * * o when a connection is established * o after an RTO * o after fast recovery * o when we send a packet and there is no outstanding * unacknowledged data (restarting an idle connection) * * In these circumstances we cannot do a Vegas calculation at the * end of the first RTT, because any calculation we do is using * stale info -- both the saved cwnd and congestion feedback are * stale. * * Instead we must wait until the completion of an RTT during * which we actually receive ACKs. */ static void vegas_enable(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); /* Begin taking Vegas samples next time we send something. */ vegas->doing_vegas_now = 1; /* Set the beginning of the next send window. */ vegas->beg_snd_nxt = tp->snd_nxt; vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } /* Stop taking Vegas samples for now. */ static inline void vegas_disable(struct sock *sk) { struct vegas *vegas = inet_csk_ca(sk); vegas->doing_vegas_now = 0; } void tcp_vegas_init(struct sock *sk) { struct vegas *vegas = inet_csk_ca(sk); vegas->baseRTT = 0x7fffffff; vegas_enable(sk); } EXPORT_SYMBOL_GPL(tcp_vegas_init); /* Do RTT sampling needed for Vegas. * Basically we: * o min-filter RTT samples from within an RTT to get the current * propagation delay + queuing delay (we are min-filtering to try to * avoid the effects of delayed ACKs) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; if (sample->rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ vrtt = sample->rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) vegas->baseRTT = vrtt; /* Find the min RTT during the last RTT to find * the current prop. delay + queuing delay: */ vegas->minRTT = min(vegas->minRTT, vrtt); vegas->cntRTT++; } EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); void tcp_vegas_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) vegas_enable(sk); else vegas_disable(sk); } EXPORT_SYMBOL_GPL(tcp_vegas_state); /* * If the connection is idle and we are restarting, * then we don't want to do any Vegas calculations * until we get fresh RTT samples. So when we * restart, we reset our Vegas state to a clean * slate. After we get acks for this flight of * packets, _then_ we can make Vegas calculations * again. */ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) tcp_vegas_init(sk); } EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { return min(tp->snd_ssthresh, tcp_snd_cwnd(tp)); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) { tcp_reno_cong_avoid(sk, ack, acked); return; } if (after(ack, vegas->beg_snd_nxt)) { /* Do the Vegas once-per-RTT cwnd adjustment. */ /* Save the extent of the current window so we can use this * at the end of the next RTT. */ vegas->beg_snd_nxt = tp->snd_nxt; /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. * If we only had 2 samples total, * then that means we're getting only 1 ACK per RTT, which * means they're almost certainly delayed ACKs. * If we have 3 samples, we should be OK. */ if (vegas->cntRTT <= 2) { /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ tcp_reno_cong_avoid(sk, ack, acked); } else { u32 rtt, diff; u64 target_cwnd; /* We have enough RTT samples, so, using the Vegas * algorithm, we determine if we should increase or * decrease cwnd, and by how much. */ /* Pluck out the RTT we are using for the Vegas * calculations. This is the min RTT seen during the * last RTT. Taking the min filters out the effects * of delayed ACKs, at the cost of noticing congestion * a bit later. */ rtt = vegas->minRTT; /* Calculate the cwnd we should have, if we weren't * going too fast. * * This is: * (actual rate in segments) * baseRTT */ target_cwnd = (u64)tcp_snd_cwnd(tp) * vegas->baseRTT; do_div(target_cwnd, rtt); /* Calculate the difference between the window we had, * and the window we would like to have. This quantity * is the "Diff" from the Arizona Vegas papers. */ diff = tcp_snd_cwnd(tp) * (rtt-vegas->baseRTT) / vegas->baseRTT; if (diff > gamma && tcp_in_slow_start(tp)) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ /* Set cwnd to match the actual rate * exactly: * cwnd = (actual rate) * baseRTT * Then we add 1 because the integer * truncation robs us of full link * utilization. */ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), (u32)target_cwnd + 1)); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { /* Congestion avoidance. */ /* Figure out where we would like cwnd * to be. */ if (diff > beta) { /* The old window was too fast, so * we slow down. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (diff < alpha) { /* We don't have enough extra packets * in the network, so speed up. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } else { /* Sending just as fast as we * should be. */ } } if (tcp_snd_cwnd(tp) < 2) tcp_snd_cwnd_set(tp, 2); else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp) tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp); tp->snd_ssthresh = tcp_current_ssthresh(sk); } /* Wipe the slate clean for the next RTT. */ vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } /* Use normal slow start */ else if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); } /* Extract info for Tcp socket info provided via netlink. */ size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct vegas *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { info->vegas.tcpv_enabled = ca->doing_vegas_now; info->vegas.tcpv_rttcnt = ca->cntRTT; info->vegas.tcpv_rtt = ca->baseRTT; info->vegas.tcpv_minrtt = ca->minRTT; *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); } return 0; } EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_vegas_cong_avoid, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, .owner = THIS_MODULE, .name = "vegas", }; static int __init tcp_vegas_register(void) { BUILD_BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_vegas); return 0; } static void __exit tcp_vegas_unregister(void) { tcp_unregister_congestion_control(&tcp_vegas); } module_init(tcp_vegas_register); module_exit(tcp_vegas_unregister); MODULE_AUTHOR("Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Vegas");
5 5 14 7 2 6 5 1 5 6 5 5 4 5 5 2 2 5 5 5 14 14 2 12 6 24 25 25 1 24 5 3 2 1 1 13 13 4 9 8 6 7 7 1 6 5 5 5 4 1 1 3 3 16 16 14 1 1 8 6 10 2 6 6 1 5 19 19 1 2 4 1 1 7 6 1 1 1 2 3 1 3 1 1 4 1 1 1 1 16 16 4 1 3 3 1 1 1 2 1 1 9 9 9 1 8 6 2 8 10 10 4 6 5 1 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth SCO sockets. */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/sco.h> static bool disable_esco; static const struct proto_ops sco_sock_ops; static struct bt_sock_list sco_sk_list = { .lock = __RW_LOCK_UNLOCKED(sco_sk_list.lock) }; /* ---- SCO connections ---- */ struct sco_conn { struct hci_conn *hcon; spinlock_t lock; struct sock *sk; struct delayed_work timeout_work; unsigned int mtu; struct kref ref; }; #define sco_conn_lock(c) spin_lock(&c->lock) #define sco_conn_unlock(c) spin_unlock(&c->lock) static void sco_sock_close(struct sock *sk); static void sco_sock_kill(struct sock *sk); /* ----- SCO socket info ----- */ #define sco_pi(sk) ((struct sco_pinfo *) sk) struct sco_pinfo { struct bt_sock bt; bdaddr_t src; bdaddr_t dst; __u32 flags; __u16 setting; struct bt_codec codec; struct sco_conn *conn; }; /* ---- SCO timers ---- */ #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) static void sco_conn_free(struct kref *ref) { struct sco_conn *conn = container_of(ref, struct sco_conn, ref); BT_DBG("conn %p", conn); if (conn->sk) sco_pi(conn->sk)->conn = NULL; if (conn->hcon) { conn->hcon->sco_data = NULL; hci_conn_drop(conn->hcon); } /* Ensure no more work items will run since hci_conn has been dropped */ disable_delayed_work_sync(&conn->timeout_work); kfree(conn); } static void sco_conn_put(struct sco_conn *conn) { if (!conn) return; BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref)); kref_put(&conn->ref, sco_conn_free); } static struct sco_conn *sco_conn_hold(struct sco_conn *conn) { BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref)); kref_get(&conn->ref); return conn; } static struct sco_conn *sco_conn_hold_unless_zero(struct sco_conn *conn) { if (!conn) return NULL; BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref)); if (!kref_get_unless_zero(&conn->ref)) return NULL; return conn; } static struct sock *sco_sock_hold(struct sco_conn *conn) { if (!conn || !bt_sock_linked(&sco_sk_list, conn->sk)) return NULL; sock_hold(conn->sk); return conn->sk; } static void sco_sock_timeout(struct work_struct *work) { struct sco_conn *conn = container_of(work, struct sco_conn, timeout_work.work); struct sock *sk; conn = sco_conn_hold_unless_zero(conn); if (!conn) return; sco_conn_lock(conn); if (!conn->hcon) { sco_conn_unlock(conn); sco_conn_put(conn); return; } sk = sco_sock_hold(conn); sco_conn_unlock(conn); sco_conn_put(conn); if (!sk) return; BT_DBG("sock %p state %d", sk, sk->sk_state); lock_sock(sk); sk->sk_err = ETIMEDOUT; sk->sk_state_change(sk); release_sock(sk); sock_put(sk); } static void sco_sock_set_timer(struct sock *sk, long timeout) { if (!sco_pi(sk)->conn) return; BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); } static void sco_sock_clear_timer(struct sock *sk) { if (!sco_pi(sk)->conn) return; BT_DBG("sock %p state %d", sk, sk->sk_state); cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); } /* ---- SCO connections ---- */ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) { struct sco_conn *conn = hcon->sco_data; conn = sco_conn_hold_unless_zero(conn); if (conn) { if (!conn->hcon) { sco_conn_lock(conn); conn->hcon = hcon; sco_conn_unlock(conn); } return conn; } conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL); if (!conn) return NULL; kref_init(&conn->ref); spin_lock_init(&conn->lock); INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); hcon->sco_data = conn; conn->hcon = hcon; conn->mtu = hcon->mtu; if (hcon->mtu > 0) conn->mtu = hcon->mtu; else conn->mtu = 60; BT_DBG("hcon %p conn %p", hcon, conn); return conn; } /* Delete channel. * Must be called on the locked socket. */ static void sco_chan_del(struct sock *sk, int err) { struct sco_conn *conn; conn = sco_pi(sk)->conn; sco_pi(sk)->conn = NULL; BT_DBG("sk %p, conn %p, err %d", sk, conn, err); if (conn) { sco_conn_lock(conn); conn->sk = NULL; sco_conn_unlock(conn); sco_conn_put(conn); } sk->sk_state = BT_CLOSED; sk->sk_err = err; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_ZAPPED); } static void sco_conn_del(struct hci_conn *hcon, int err) { struct sco_conn *conn = hcon->sco_data; struct sock *sk; conn = sco_conn_hold_unless_zero(conn); if (!conn) return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); sco_conn_lock(conn); sk = sco_sock_hold(conn); sco_conn_unlock(conn); sco_conn_put(conn); if (!sk) { sco_conn_put(conn); return; } /* Kill socket */ lock_sock(sk); sco_sock_clear_timer(sk); sco_chan_del(sk, err); release_sock(sk); sock_put(sk); } static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) { BT_DBG("conn %p", conn); sco_pi(sk)->conn = conn; conn->sk = sk; if (parent) bt_accept_enqueue(parent, sk, true); } static int sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) { int err = 0; sco_conn_lock(conn); if (conn->sk) err = -EBUSY; else __sco_chan_add(conn, sk, parent); sco_conn_unlock(conn); return err; } static int sco_connect(struct sock *sk) { struct sco_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err, type; BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (lmp_esco_capable(hdev) && !disable_esco) type = ESCO_LINK; else type = SCO_LINK; switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } break; } hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, sco_pi(sk)->setting, &sco_pi(sk)->codec, sk->sk_sndtimeo); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } conn = sco_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = sco_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&sco_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { sco_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else { sk->sk_state = BT_CONNECT; sco_sock_set_timer(sk, sk->sk_sndtimeo); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int sco_send_frame(struct sock *sk, struct sk_buff *skb, const struct sockcm_cookie *sockc) { struct sco_conn *conn = sco_pi(sk)->conn; int len = skb->len; /* Check outgoing MTU */ if (len > conn->mtu) return -EINVAL; BT_DBG("sk %p len %d", sk, len); hci_setup_tx_timestamp(skb, 1, sockc); hci_send_sco(conn->hcon, skb); return len; } static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) { struct sock *sk; sco_conn_lock(conn); sk = conn->sk; sco_conn_unlock(conn); if (!sk) goto drop; BT_DBG("sk %p len %u", sk, skb->len); if (sk->sk_state != BT_CONNECTED) goto drop; if (!sock_queue_rcv_skb(sk, skb)) return; drop: kfree_skb(skb); } /* -------- Socket interface ---------- */ static struct sock *__sco_get_sock_listen_by_addr(bdaddr_t *ba) { struct sock *sk; sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (!bacmp(&sco_pi(sk)->src, ba)) return sk; } return NULL; } /* Find socket listening on source bdaddr. * Returns closest match. */ static struct sock *sco_get_sock_listen(bdaddr_t *src) { struct sock *sk = NULL, *sk1 = NULL; read_lock(&sco_sk_list.lock); sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; /* Exact match. */ if (!bacmp(&sco_pi(sk)->src, src)) break; /* Closest match */ if (!bacmp(&sco_pi(sk)->src, BDADDR_ANY)) sk1 = sk; } read_unlock(&sco_sk_list.lock); return sk ? sk : sk1; } static void sco_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); sco_conn_put(sco_pi(sk)->conn); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void sco_sock_cleanup_listen(struct sock *parent) { struct sock *sk; BT_DBG("parent %p", parent); /* Close not yet accepted channels */ while ((sk = bt_accept_dequeue(parent, NULL))) { sco_sock_close(sk); sco_sock_kill(sk); } parent->sk_state = BT_CLOSED; sock_set_flag(parent, SOCK_ZAPPED); } /* Kill socket (only if zapped and orphan) * Must be called on unlocked socket. */ static void sco_sock_kill(struct sock *sk) { if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) return; BT_DBG("sk %p state %d", sk, sk->sk_state); /* Kill poor orphan */ bt_sock_unlink(&sco_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); sock_put(sk); } static void __sco_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); switch (sk->sk_state) { case BT_LISTEN: sco_sock_cleanup_listen(sk); break; case BT_CONNECTED: case BT_CONFIG: case BT_CONNECT2: case BT_CONNECT: case BT_DISCONN: sco_chan_del(sk, ECONNRESET); break; default: sock_set_flag(sk, SOCK_ZAPPED); break; } } /* Must be called on unlocked socket. */ static void sco_sock_close(struct sock *sk) { lock_sock(sk); sco_sock_clear_timer(sk); __sco_sock_close(sk); release_sock(sk); } static void sco_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); if (parent) { sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); } } static struct proto sco_proto = { .name = "SCO", .owner = THIS_MODULE, .obj_size = sizeof(struct sco_pinfo) }; static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; sk = bt_sock_alloc(net, sock, &sco_proto, proto, prio, kern); if (!sk) return NULL; sk->sk_destruct = sco_sock_destruct; sk->sk_sndtimeo = SCO_CONN_TIMEOUT; sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; sco_pi(sk)->codec.id = BT_CODEC_CVSD; sco_pi(sk)->codec.cid = 0xffff; sco_pi(sk)->codec.vid = 0xffff; sco_pi(sk)->codec.data_path = 0x00; bt_sock_link(&sco_sk_list, sk); return sk; } static int sco_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_SEQPACKET) return -ESOCKTNOSUPPORT; sock->ops = &sco_sock_ops; sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sco_sock_init(sk, NULL); return 0; } static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; int err = 0; if (!addr || addr_len < sizeof(struct sockaddr_sco) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr); lock_sock(sk); if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } bacpy(&sco_pi(sk)->src, &sa->sco_bdaddr); sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; int err; BT_DBG("sk %p", sk); if (alen < sizeof(struct sockaddr_sco) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) return -EBADFD; if (sk->sk_type != SOCK_SEQPACKET) err = -EINVAL; lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); release_sock(sk); err = sco_connect(sk); if (err) return err; lock_sock(sk); err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); release_sock(sk); return err; } static int sco_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; bdaddr_t *src = &sco_pi(sk)->src; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } write_lock(&sco_sk_list.lock); if (__sco_get_sock_listen_by_addr(src)) { err = -EADDRINUSE; goto unlock; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = BT_LISTEN; unlock: write_unlock(&sco_sk_list.lock); done: release_sock(sk); return err; } static int sco_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; long timeo; int err = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } ch = bt_accept_dequeue(sk, newsock); if (ch) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", ch); done: release_sock(sk); return err; } static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; if (peer) bacpy(&sa->sco_bdaddr, &sco_pi(sk)->dst); else bacpy(&sa->sco_bdaddr, &sco_pi(sk)->src); return sizeof(struct sockaddr_sco); } static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; struct sockcm_cookie sockc; int err; BT_DBG("sock %p, sk %p", sock, sk); err = sock_error(sk); if (err) return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; hci_sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (err) return err; } skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); if (IS_ERR(skb)) return PTR_ERR(skb); lock_sock(sk); if (sk->sk_state == BT_CONNECTED) err = sco_send_frame(sk, skb, &sockc); else err = -ENOTCONN; release_sock(sk); if (err < 0) kfree_skb(skb); return err; } static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) { struct hci_dev *hdev = conn->hdev; BT_DBG("conn %p", conn); conn->state = BT_CONFIG; if (!lmp_esco_capable(hdev)) { struct hci_cp_accept_conn_req cp; bacpy(&cp.bdaddr, &conn->dst); cp.role = 0x00; /* Ignored */ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp); } else { struct hci_cp_accept_sync_conn_req cp; bacpy(&cp.bdaddr, &conn->dst); cp.pkt_type = cpu_to_le16(conn->pkt_type); cp.tx_bandwidth = cpu_to_le32(0x00001f40); cp.rx_bandwidth = cpu_to_le32(0x00001f40); cp.content_format = cpu_to_le16(setting); switch (setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (conn->pkt_type & ESCO_2EV3) cp.max_latency = cpu_to_le16(0x0008); else cp.max_latency = cpu_to_le16(0x000D); cp.retrans_effort = 0x02; break; case SCO_AIRMODE_CVSD: cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; default: /* use CVSD settings as fallback */ cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; } hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, sizeof(cp), &cp); } } static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sco_pinfo *pi = sco_pi(sk); if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, BT_SCM_ERROR); lock_sock(sk); if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { sco_conn_defer_accept(pi->conn->hcon, pi->setting); sk->sk_state = BT_CONFIG; release_sock(sk); return 0; } release_sock(sk); return bt_sock_recvmsg(sock, msg, len, flags); } static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; struct bt_voice voice; u32 opt; struct bt_codecs *codecs; struct hci_dev *hdev; __u8 buffer[255]; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); else clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; case BT_VOICE: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } voice.setting = sco_pi(sk)->setting; err = copy_safe_from_sockptr(&voice, sizeof(voice), optval, optlen); if (err) break; sco_pi(sk)->setting = voice.setting; hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (enhanced_sync_conn_capable(hdev)) sco_pi(sk)->codec.id = BT_CODEC_TRANSPARENT; break; } hci_dev_put(hdev); break; case BT_PKT_STATUS: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_CODEC: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (!hdev->get_data_path_id) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (optlen < sizeof(struct bt_codecs) || optlen > sizeof(buffer)) { hci_dev_put(hdev); err = -EINVAL; break; } err = copy_struct_from_sockptr(buffer, sizeof(buffer), optval, optlen); if (err) { hci_dev_put(hdev); break; } codecs = (void *)buffer; if (codecs->num_codecs > 1) { hci_dev_put(hdev); err = -EINVAL; break; } sco_pi(sk)->codec = codecs->codecs[0]; hci_dev_put(hdev); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sco_options opts; struct sco_conninfo cinfo; int err = 0; size_t len; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case SCO_OPTIONS: if (sk->sk_state != BT_CONNECTED && !(sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) { err = -ENOTCONN; break; } opts.mtu = sco_pi(sk)->conn->mtu; BT_DBG("mtu %u", opts.mtu); len = min(len, sizeof(opts)); if (copy_to_user(optval, (char *)&opts, len)) err = -EFAULT; break; case SCO_CONNINFO: if (sk->sk_state != BT_CONNECTED && !(sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) { err = -ENOTCONN; break; } memset(&cinfo, 0, sizeof(cinfo)); cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle; memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3); len = min(len, sizeof(cinfo)); if (copy_to_user(optval, (char *)&cinfo, len)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_voice voice; u32 phys; int buf_len; struct codec_list *c; u8 num_codecs, i, __user *ptr; struct hci_dev *hdev; struct hci_codec_caps *caps; struct bt_codec codec; BT_DBG("sk %p", sk); if (level == SOL_SCO) return sco_sock_getsockopt_old(sock, optname, optval, optlen); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *)optval)) err = -EFAULT; break; case BT_VOICE: voice.setting = sco_pi(sk)->setting; len = min_t(unsigned int, len, sizeof(voice)); if (copy_to_user(optval, (char *)&voice, len)) err = -EFAULT; break; case BT_PHY: if (sk->sk_state != BT_CONNECTED) { err = -ENOTCONN; break; } phys = hci_conn_get_phy(sco_pi(sk)->conn->hcon); if (put_user(phys, (u32 __user *) optval)) err = -EFAULT; break; case BT_PKT_STATUS: if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), (int __user *)optval)) err = -EFAULT; break; case BT_SNDMTU: case BT_RCVMTU: if (sk->sk_state != BT_CONNECTED) { err = -ENOTCONN; break; } if (put_user(sco_pi(sk)->conn->mtu, (u32 __user *)optval)) err = -EFAULT; break; case BT_CODEC: num_codecs = 0; buf_len = 0; hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (!hdev->get_data_path_id) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } release_sock(sk); /* find total buffer size required to copy codec + caps */ hci_dev_lock(hdev); list_for_each_entry(c, &hdev->local_codecs, list) { if (c->transport != HCI_TRANSPORT_SCO_ESCO) continue; num_codecs++; for (i = 0, caps = c->caps; i < c->num_caps; i++) { buf_len += 1 + caps->len; caps = (void *)&caps->data[caps->len]; } buf_len += sizeof(struct bt_codec); } hci_dev_unlock(hdev); buf_len += sizeof(struct bt_codecs); if (buf_len > len) { hci_dev_put(hdev); return -ENOBUFS; } ptr = optval; if (put_user(num_codecs, ptr)) { hci_dev_put(hdev); return -EFAULT; } ptr += sizeof(num_codecs); /* Iterate all the codecs supported over SCO and populate * codec data */ hci_dev_lock(hdev); list_for_each_entry(c, &hdev->local_codecs, list) { if (c->transport != HCI_TRANSPORT_SCO_ESCO) continue; codec.id = c->id; codec.cid = c->cid; codec.vid = c->vid; err = hdev->get_data_path_id(hdev, &codec.data_path); if (err < 0) break; codec.num_caps = c->num_caps; if (copy_to_user(ptr, &codec, sizeof(codec))) { err = -EFAULT; break; } ptr += sizeof(codec); /* find codec capabilities data length */ len = 0; for (i = 0, caps = c->caps; i < c->num_caps; i++) { len += 1 + caps->len; caps = (void *)&caps->data[caps->len]; } /* copy codec capabilities data */ if (len && copy_to_user(ptr, c->caps, len)) { err = -EFAULT; break; } ptr += len; } hci_dev_unlock(hdev); hci_dev_put(hdev); lock_sock(sk); if (!err && put_user(buf_len, optlen)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; sock_hold(sk); lock_sock(sk); if (!sk->sk_shutdown) { sk->sk_shutdown = SHUTDOWN_MASK; sco_sock_clear_timer(sk); __sco_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); } release_sock(sk); sock_put(sk); return err; } static int sco_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; sco_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); release_sock(sk); } sock_orphan(sk); sco_sock_kill(sk); return err; } static void sco_conn_ready(struct sco_conn *conn) { struct sock *parent; struct sock *sk = conn->sk; BT_DBG("conn %p", conn); if (sk) { lock_sock(sk); sco_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); release_sock(sk); } else { sco_conn_lock(conn); if (!conn->hcon) { sco_conn_unlock(conn); return; } parent = sco_get_sock_listen(&conn->hcon->src); if (!parent) { sco_conn_unlock(conn); return; } lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); sco_conn_unlock(conn); return; } sco_sock_init(sk, parent); bacpy(&sco_pi(sk)->src, &conn->hcon->src); bacpy(&sco_pi(sk)->dst, &conn->hcon->dst); sco_conn_hold(conn); hci_conn_hold(conn->hcon); __sco_chan_add(conn, sk, parent); if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else sk->sk_state = BT_CONNECTED; /* Wake up parent */ parent->sk_data_ready(parent); release_sock(parent); sco_conn_unlock(conn); } } /* ----- SCO interface with lower layer (HCI) ----- */ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct sock *sk; int lm = 0; BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr); /* Find listening sockets */ read_lock(&sco_sk_list.lock); sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (!bacmp(&sco_pi(sk)->src, &hdev->bdaddr) || !bacmp(&sco_pi(sk)->src, BDADDR_ANY)) { lm |= HCI_LM_ACCEPT; if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; break; } } read_unlock(&sco_sk_list.lock); return lm; } static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) return; BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status); if (!status) { struct sco_conn *conn; conn = sco_conn_add(hcon); if (conn) { sco_conn_ready(conn); sco_conn_put(conn); } } else sco_conn_del(hcon, bt_to_errno(status)); } static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); sco_conn_del(hcon, bt_to_errno(reason)); } void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) { struct sco_conn *conn = hcon->sco_data; if (!conn) goto drop; BT_DBG("conn %p len %u", conn, skb->len); if (skb->len) { sco_recv_frame(conn, skb); return; } drop: kfree_skb(skb); } static struct hci_cb sco_cb = { .name = "SCO", .connect_cfm = sco_connect_cfm, .disconn_cfm = sco_disconn_cfm, }; static int sco_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; read_lock(&sco_sk_list.lock); sk_for_each(sk, &sco_sk_list.head) { seq_printf(f, "%pMR %pMR %d\n", &sco_pi(sk)->src, &sco_pi(sk)->dst, sk->sk_state); } read_unlock(&sco_sk_list.lock); return 0; } DEFINE_SHOW_ATTRIBUTE(sco_debugfs); static struct dentry *sco_debugfs; static const struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = sco_sock_release, .bind = sco_sock_bind, .connect = sco_sock_connect, .listen = sco_sock_listen, .accept = sco_sock_accept, .getname = sco_sock_getname, .sendmsg = sco_sock_sendmsg, .recvmsg = sco_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .gettstamp = sock_gettstamp, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = sco_sock_shutdown, .setsockopt = sco_sock_setsockopt, .getsockopt = sco_sock_getsockopt }; static const struct net_proto_family sco_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = sco_sock_create, }; int __init sco_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_sco) > sizeof(struct sockaddr)); err = proto_register(&sco_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_SCO, &sco_sock_family_ops); if (err < 0) { BT_ERR("SCO socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "sco", &sco_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create SCO proc file"); bt_sock_unregister(BTPROTO_SCO); goto error; } BT_INFO("SCO socket layer initialized"); hci_register_cb(&sco_cb); if (IS_ERR_OR_NULL(bt_debugfs)) return 0; sco_debugfs = debugfs_create_file("sco", 0444, bt_debugfs, NULL, &sco_debugfs_fops); return 0; error: proto_unregister(&sco_proto); return err; } void sco_exit(void) { bt_procfs_cleanup(&init_net, "sco"); debugfs_remove(sco_debugfs); hci_unregister_cb(&sco_cb); bt_sock_unregister(BTPROTO_SCO); proto_unregister(&sco_proto); } module_param(disable_esco, bool, 0644); MODULE_PARM_DESC(disable_esco, "Disable eSCO connection creation");
24 2314 2311 421 423 11 1980 1978 1985 1988 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0-only /* * fs/anon_inodes.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * * Thanks to Arnd Bergmann for code review and suggestions. * More changes for Thomas Gleixner suggestions. * */ #include <linux/cred.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/magic.h> #include <linux/anon_inodes.h> #include <linux/pseudo_fs.h> #include <linux/uaccess.h> static struct vfsmount *anon_inode_mnt __ro_after_init; static struct inode *anon_inode_inode __ro_after_init; /* * anon_inodefs_dname() is called from d_path(). */ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "anon_inode:%s", dentry->d_name.name); } static const struct dentry_operations anon_inodefs_dentry_operations = { .d_dname = anon_inodefs_dname, }; static int anon_inodefs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC); if (!ctx) return -ENOMEM; ctx->dops = &anon_inodefs_dentry_operations; return 0; } static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", .init_fs_context = anon_inodefs_init_fs_context, .kill_sb = kill_anon_super, }; static struct inode *anon_inode_make_secure_inode( const char *name, const struct inode *context_inode) { struct inode *inode; int error; inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(inode)) return inode; inode->i_flags &= ~S_PRIVATE; error = security_inode_init_security_anon(inode, &QSTR(name), context_inode); if (error) { iput(inode); return ERR_PTR(error); } return inode; } static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, bool make_inode) { struct inode *inode; struct file *file; if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); if (make_inode) { inode = anon_inode_make_secure_inode(name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); goto err; } } else { inode = anon_inode_inode; if (IS_ERR(inode)) { file = ERR_PTR(-ENODEV); goto err; } /* * We know the anon_inode inode count is always * greater than zero, so ihold() is safe. */ ihold(inode); } file = alloc_file_pseudo(inode, anon_inode_mnt, name, flags & (O_ACCMODE | O_NONBLOCK), fops); if (IS_ERR(file)) goto err_iput; file->f_mapping = inode->i_mapping; file->private_data = priv; return file; err_iput: iput(inode); err: module_put(fops->owner); return file; } /** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfile() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. Returns the newly created file* or an error pointer. */ struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags) { return __anon_inode_getfile(name, fops, priv, flags, NULL, false); } EXPORT_SYMBOL_GPL(anon_inode_getfile); /** * anon_inode_getfile_fmode - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @f_mode: [in] fmode * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfile() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. Allows setting the fmode. Returns the newly created file* or an error * pointer. */ struct file *anon_inode_getfile_fmode(const char *name, const struct file_operations *fops, void *priv, int flags, fmode_t f_mode) { struct file *file; file = __anon_inode_getfile(name, fops, priv, flags, NULL, false); if (!IS_ERR(file)) file->f_mode |= f_mode; return file; } EXPORT_SYMBOL_GPL(anon_inode_getfile_fmode); /** * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new * !S_PRIVATE anon inode rather than reuse the * singleton anon inode and calls the * inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @context_inode: * [in] the logical relationship with the new inode (optional) * * Create a new anonymous inode and file pair. This can be done for two * reasons: * * - for the inode to have its own security context, so that LSMs can enforce * policy on the inode's creation; * * - if the caller needs a unique inode, for example in order to customize * the size returned by fstat() * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. * * Returns the newly created file* or an error pointer. */ struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfile(name, fops, priv, flags, context_inode, true); } EXPORT_SYMBOL_GPL(anon_inode_create_getfile); static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, bool make_inode) { int error, fd; struct file *file; error = get_unused_fd_flags(flags); if (error < 0) return error; fd = error; file = __anon_inode_getfile(name, fops, priv, flags, context_inode, make_inode); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; } fd_install(fd, file); return fd; err_put_unused_fd: put_unused_fd(fd); return error; } /** * anon_inode_getfd - creates a new file instance by hooking it up to * an anonymous inode and a dentry that describe * the "class" of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is * useful for files that do not need to have a full-fledged inode in * order to operate correctly. All the files created with * anon_inode_getfd() will use the same singleton inode, reducing * memory use and avoiding code duplication for the file/inode/dentry * setup. Returns a newly created file descriptor or an error code. */ int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags) { return __anon_inode_getfd(name, fops, priv, flags, NULL, false); } EXPORT_SYMBOL_GPL(anon_inode_getfd); /** * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls * the inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @context_inode: * [in] the logical relationship with the new inode (optional) * * Create a new anonymous inode and file pair. This can be done for two * reasons: * * - for the inode to have its own security context, so that LSMs can enforce * policy on the inode's creation; * * - if the caller needs a unique inode, for example in order to customize * the size returned by fstat() * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. * * Returns a newly created file descriptor or an error code. */ int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); } static int __init anon_inode_init(void) { anon_inode_mnt = kern_mount(&anon_inode_fs_type); if (IS_ERR(anon_inode_mnt)) panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt)); anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(anon_inode_inode)) panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode)); return 0; } fs_initcall(anon_inode_init);
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BIO_INTEGRITY_H #define _LINUX_BIO_INTEGRITY_H #include <linux/bio.h> enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ }; struct bio_integrity_payload { struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ struct bio_vec *bip_vec; }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) #define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { if (bio->bi_opf & REQ_INTEGRITY) return bio->bi_integrity; return NULL; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip) return bip->bip_flags & flag; return false; } static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { return bip->bip_iter.bi_sector; } static inline void bip_set_seed(struct bio_integrity_payload *bip, sector_t seed) { bip->bip_iter.bi_sector = seed; } void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { return NULL; } static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) { return -EINVAL; } static inline void bio_integrity_unmap_user(struct bio *bio) { } static inline bool bio_integrity_prep(struct bio *bio) { return true; } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { return 0; } static inline void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { } static inline void bio_integrity_trim(struct bio *bio) { } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; } static inline struct bio_integrity_payload * bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr) { return ERR_PTR(-EINVAL); } static inline int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* _LINUX_BIO_INTEGRITY_H */
1333 1036 225 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_BL_H #define _LINUX_RCULIST_BL_H /* * RCU-protected bl list version. See include/linux/list_bl.h. */ #include <linux/list_bl.h> #include <linux/rcupdate.h> static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_bl_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry(). */ static inline void hlist_bl_del_rcu(struct hlist_bl_node *n) { __hlist_bl_del(n); n->pprev = LIST_POISON2; } /** * hlist_bl_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_bl, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first; /* don't need hlist_bl_first_rcu because we're under lock */ first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; /* need _rcu because we can have concurrent lock free readers */ hlist_bl_set_first_rcu(h, n); } /** * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_bl_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_bl_node within the struct. * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = hlist_bl_first_rcu(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(pos->next)) #endif
2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Glue Code for the AVX/AES-NI/GFNI assembler implementation of the ARIA Cipher * * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com> */ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> #include "ecb_cbc_helpers.h" #include "aria-avx.h" asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_encrypt_16way); asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_decrypt_16way); asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); EXPORT_SYMBOL_GPL(aria_aesni_avx_ctr_crypt_16way); #ifdef CONFIG_AS_GFNI asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_encrypt_16way); asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_decrypt_16way); asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst, const u8 *src, u8 *keystream, u8 *iv); EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_ctr_crypt_16way); #endif /* CONFIG_AS_GFNI */ static struct aria_avx_ops aria_ops; struct aria_avx_request_ctx { u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE]; }; static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey) { ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way); ECB_BLOCK(1, aria_encrypt); ECB_WALK_END(); } static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey) { ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way); ECB_BLOCK(1, aria_decrypt); ECB_WALK_END(); } static int aria_avx_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_encrypt(req, ctx->enc_key[0]); } static int aria_avx_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_decrypt(req, ctx->dec_key[0]); } static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return aria_set_key(&tfm->base, key, keylen); } static int aria_avx_ctr_encrypt(struct skcipher_request *req) { struct aria_avx_request_ctx *req_ctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) { kernel_fpu_begin(); aria_ops.aria_ctr_crypt_16way(ctx, dst, src, &req_ctx->keystream[0], walk.iv); kernel_fpu_end(); dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE; src += ARIA_AESNI_PARALLEL_BLOCK_SIZE; nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE; } while (nbytes >= ARIA_BLOCK_SIZE) { memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE); crypto_inc(walk.iv, ARIA_BLOCK_SIZE); aria_encrypt(ctx, &req_ctx->keystream[0], &req_ctx->keystream[0]); crypto_xor_cpy(dst, src, &req_ctx->keystream[0], ARIA_BLOCK_SIZE); dst += ARIA_BLOCK_SIZE; src += ARIA_BLOCK_SIZE; nbytes -= ARIA_BLOCK_SIZE; } if (walk.nbytes == walk.total && nbytes > 0) { memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE); crypto_inc(walk.iv, ARIA_BLOCK_SIZE); aria_encrypt(ctx, &req_ctx->keystream[0], &req_ctx->keystream[0]); crypto_xor_cpy(dst, src, &req_ctx->keystream[0], nbytes); dst += nbytes; src += nbytes; nbytes = 0; } err = skcipher_walk_done(&walk, nbytes); } return err; } static int aria_avx_init_tfm(struct crypto_skcipher *tfm) { crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx_request_ctx)); return 0; } static struct skcipher_alg aria_algs[] = { { .base.cra_name = "__ecb(aria)", .base.cra_driver_name = "__ecb-aria-avx", .base.cra_priority = 400, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, .min_keysize = ARIA_MIN_KEY_SIZE, .max_keysize = ARIA_MAX_KEY_SIZE, .setkey = aria_avx_set_key, .encrypt = aria_avx_ecb_encrypt, .decrypt = aria_avx_ecb_decrypt, }, { .base.cra_name = "__ctr(aria)", .base.cra_driver_name = "__ctr-aria-avx", .base.cra_priority = 400, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, .min_keysize = ARIA_MIN_KEY_SIZE, .max_keysize = ARIA_MAX_KEY_SIZE, .ivsize = ARIA_BLOCK_SIZE, .chunksize = ARIA_BLOCK_SIZE, .walksize = 16 * ARIA_BLOCK_SIZE, .setkey = aria_avx_set_key, .encrypt = aria_avx_ctr_encrypt, .decrypt = aria_avx_ctr_encrypt, .init = aria_avx_init_tfm, } }; static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; static int __init aria_avx_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } if (boot_cpu_has(X86_FEATURE_GFNI) && IS_ENABLED(CONFIG_AS_GFNI)) { aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way; aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way; aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way; } else { aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way; aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way; aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way; } return simd_register_skciphers_compat(aria_algs, ARRAY_SIZE(aria_algs), aria_simd_algs); } static void __exit aria_avx_exit(void) { simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), aria_simd_algs); } module_init(aria_avx_init); module_exit(aria_avx_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>"); MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX/AES-NI/GFNI optimized"); MODULE_ALIAS_CRYPTO("aria"); MODULE_ALIAS_CRYPTO("aria-aesni-avx");
5 1 5 7 2 6 2 1 1 2 2 3 8 3 8 7 3 3 2 2 13 10 3 13 3 4 9 9 1 5 7 8 5 4 1 8 8 2 12 11 11 10 6 6 6 4 29 1 1 2 2 11 7 5 1 2 2 6 7 2 2 7 2 7 6 9 14 2 15 2 15 15 8 8 8 6 2 6 2 8 93 93 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB * * Refer to: * draft-ietf-forces-interfelfb-03 * and * netdev01 paper: * "Distributing Linux Traffic Control Classifier-Action * Subsystem" * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai * * copyright Jamal Hadi Salim (2015) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_ife.h> #include <net/tc_act/tc_ife.h> #include <linux/etherdevice.h> #include <net/ife.h> #include <net/tc_wrapper.h> static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)}, [TCA_IFE_DMAC] = { .len = ETH_ALEN}, [TCA_IFE_SMAC] = { .len = ETH_ALEN}, [TCA_IFE_TYPE] = { .type = NLA_U16}, }; int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) { u16 edata = 0; if (mi->metaval) edata = *(u16 *)mi->metaval; else if (metaval) edata = metaval; if (!edata) /* will not encode */ return 0; edata = htons(edata); return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata); } EXPORT_SYMBOL_GPL(ife_encode_meta_u16); int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval); else return nla_put(skb, mi->metaid, 0, NULL); } EXPORT_SYMBOL_GPL(ife_get_meta_u32); int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) { if (metaval || mi->metaval) return 8; /* T+L+V == 2+2+4 */ return 0; } EXPORT_SYMBOL_GPL(ife_check_meta_u32); int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi) { if (metaval || mi->metaval) return 8; /* T+L+(V) == 2+2+(2+2bytepad) */ return 0; } EXPORT_SYMBOL_GPL(ife_check_meta_u16); int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) { u32 edata = metaval; if (mi->metaval) edata = *(u32 *)mi->metaval; else if (metaval) edata = metaval; if (!edata) /* will not encode */ return 0; edata = htonl(edata); return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata); } EXPORT_SYMBOL_GPL(ife_encode_meta_u32); int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval); else return nla_put(skb, mi->metaid, 0, NULL); } EXPORT_SYMBOL_GPL(ife_get_meta_u16); int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp) { mi->metaval = kmemdup(metaval, sizeof(u32), gfp); if (!mi->metaval) return -ENOMEM; return 0; } EXPORT_SYMBOL_GPL(ife_alloc_meta_u32); int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp) { mi->metaval = kmemdup(metaval, sizeof(u16), gfp); if (!mi->metaval) return -ENOMEM; return 0; } EXPORT_SYMBOL_GPL(ife_alloc_meta_u16); void ife_release_meta_gen(struct tcf_meta_info *mi) { kfree(mi->metaval); } EXPORT_SYMBOL_GPL(ife_release_meta_gen); int ife_validate_meta_u32(void *val, int len) { if (len == sizeof(u32)) return 0; return -EINVAL; } EXPORT_SYMBOL_GPL(ife_validate_meta_u32); int ife_validate_meta_u16(void *val, int len) { /* length will not include padding */ if (len == sizeof(u16)) return 0; return -EINVAL; } EXPORT_SYMBOL_GPL(ife_validate_meta_u16); static LIST_HEAD(ifeoplist); static DEFINE_RWLOCK(ife_mod_lock); static struct tcf_meta_ops *find_ife_oplist(u16 metaid) { struct tcf_meta_ops *o; read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { if (o->metaid == metaid) { if (!try_module_get(o->owner)) o = NULL; read_unlock(&ife_mod_lock); return o; } } read_unlock(&ife_mod_lock); return NULL; } int register_ife_op(struct tcf_meta_ops *mops) { struct tcf_meta_ops *m; if (!mops->metaid || !mops->metatype || !mops->name || !mops->check_presence || !mops->encode || !mops->decode || !mops->get || !mops->alloc) return -EINVAL; write_lock(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid || (strcmp(mops->name, m->name) == 0)) { write_unlock(&ife_mod_lock); return -EEXIST; } } if (!mops->release) mops->release = ife_release_meta_gen; list_add_tail(&mops->list, &ifeoplist); write_unlock(&ife_mod_lock); return 0; } EXPORT_SYMBOL_GPL(unregister_ife_op); int unregister_ife_op(struct tcf_meta_ops *mops) { struct tcf_meta_ops *m; int err = -ENOENT; write_lock(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid) { list_del(&mops->list); err = 0; break; } } write_unlock(&ife_mod_lock); return err; } EXPORT_SYMBOL_GPL(register_ife_op); static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) { int ret = 0; /* XXX: unfortunately cant use nla_policy at this point * because a length of 0 is valid in the case of * "allow". "use" semantics do enforce for proper * length and i couldve use nla_policy but it makes it hard * to use it just for that.. */ if (ops->validate) return ops->validate(val, len); if (ops->metatype == NLA_U32) ret = ife_validate_meta_u32(val, len); else if (ops->metatype == NLA_U16) ret = ife_validate_meta_u16(val, len); return ret; } #ifdef CONFIG_MODULES static const char *ife_meta_id2name(u32 metaid) { switch (metaid) { case IFE_META_SKBMARK: return "skbmark"; case IFE_META_PRIO: return "skbprio"; case IFE_META_TCINDEX: return "tcindex"; default: return "unknown"; } } #endif /* called when adding new meta information */ static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held) { struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret = 0; if (!ops) { ret = -ENOENT; #ifdef CONFIG_MODULES if (rtnl_held) rtnl_unlock(); request_module("ife-meta-%s", ife_meta_id2name(metaid)); if (rtnl_held) rtnl_lock(); ops = find_ife_oplist(metaid); #endif } if (ops) { ret = 0; if (len) ret = ife_validate_metatype(ops, val, len); module_put(ops->owner); } return ret; } /* called when adding new meta information */ static int __add_metainfo(const struct tcf_meta_ops *ops, struct tcf_ife_info *ife, u32 metaid, void *metaval, int len, bool atomic, bool exists) { struct tcf_meta_info *mi = NULL; int ret = 0; mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL); if (!mi) return -ENOMEM; mi->metaid = metaid; mi->ops = ops; if (len > 0) { ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL); if (ret != 0) { kfree(mi); return ret; } } if (exists) spin_lock_bh(&ife->tcf_lock); list_add_tail(&mi->metalist, &ife->metalist); if (exists) spin_unlock_bh(&ife->tcf_lock); return ret; } static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops, struct tcf_ife_info *ife, u32 metaid, bool exists) { int ret; if (!try_module_get(ops->owner)) return -ENOENT; ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists); if (ret) module_put(ops->owner); return ret; } static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, int len, bool exists) { const struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret; if (!ops) return -ENOENT; ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists); if (ret) /*put back what find_ife_oplist took */ module_put(ops->owner); return ret; } static int use_all_metadata(struct tcf_ife_info *ife, bool exists) { struct tcf_meta_ops *o; int rc = 0; int installed = 0; read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists); if (rc == 0) installed += 1; } read_unlock(&ife_mod_lock); if (installed) return 0; else return -EINVAL; } static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) { struct tcf_meta_info *e; struct nlattr *nest; unsigned char *b = skb_tail_pointer(skb); int total_encoded = 0; /*can only happen on decode */ if (list_empty(&ife->metalist)) return 0; nest = nla_nest_start_noflag(skb, TCA_IFE_METALST); if (!nest) goto out_nlmsg_trim; list_for_each_entry(e, &ife->metalist, metalist) { if (!e->ops->get(skb, e)) total_encoded += 1; } if (!total_encoded) goto out_nlmsg_trim; nla_nest_end(skb, nest); return 0; out_nlmsg_trim: nlmsg_trim(skb, b); return -1; } /* under ife->tcf_lock */ static void _tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { list_del(&e->metalist); if (e->metaval) { if (e->ops->release) e->ops->release(e); else kfree(e->metaval); } module_put(e->ops->owner); kfree(e); } } static void tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a); spin_unlock_bh(&ife->tcf_lock); p = rcu_dereference_protected(ife->params, 1); if (p) kfree_rcu(p, rcu); } static int load_metalist(struct nlattr **tb, bool rtnl_held) { int i; for (i = 1; i < max_metacnt; i++) { if (tb[i]) { void *val = nla_data(tb[i]); int len = nla_len(tb[i]); int rc; rc = load_metaops_and_vet(i, val, len, rtnl_held); if (rc != 0) return rc; } } return 0; } static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, bool exists, bool rtnl_held) { int len = 0; int rc = 0; int i = 0; void *val; for (i = 1; i < max_metacnt; i++) { if (tb[i]) { val = nla_data(tb[i]); len = nla_len(tb[i]); rc = add_metainfo(ife, i, val, len, exists); if (rc) return rc; } } return rc; } static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_ife_params *p; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; u8 *daddr = NULL; u8 *saddr = NULL; bool exists = false; int ret = 0; u32 index; int err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy, NULL); if (err < 0) return err; if (!tb[TCA_IFE_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_IFE_PARMS]); /* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because * they cannot run as the same time. Check on all other values which * are not supported right now. */ if (parm->flags & ~IFE_ENCODE) return -EINVAL; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], NULL, NULL); if (err) { kfree(p); return err; } err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) { kfree(p); return err; } } index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) { kfree(p); return err; } exists = err; if (exists && bind) { kfree(p); return ACT_P_BOUND; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_ife_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); kfree(p); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); kfree(p); return -EEXIST; } ife = to_ife(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&ife->metalist); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { if (tb[TCA_IFE_TYPE]) ife_type = nla_get_u16(tb[TCA_IFE_TYPE]); if (tb[TCA_IFE_DMAC]) daddr = nla_data(tb[TCA_IFE_DMAC]); if (tb[TCA_IFE_SMAC]) saddr = nla_data(tb[TCA_IFE_SMAC]); } if (parm->flags & IFE_ENCODE) { if (daddr) ether_addr_copy(p->eth_dst, daddr); else eth_zero_addr(p->eth_dst); if (saddr) ether_addr_copy(p->eth_src, saddr); else eth_zero_addr(p->eth_src); p->eth_type = ife_type; } if (tb[TCA_IFE_METALST]) { err = populate_metalist(ife, tb2, exists, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) goto metadata_parse_err; } else { /* if no passed metadata allow list or passed allow-all * then here we process by adding as many supported metadatum * as we can. You better have at least one else we are * going to bail out */ err = use_all_metadata(ife, exists); if (err) goto metadata_parse_err; } if (exists) spin_lock_bh(&ife->tcf_lock); /* protected by tcf_lock when modifying existing action */ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(ife->params, p, 1); if (exists) spin_unlock_bh(&ife->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); return ret; metadata_parse_err: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: kfree(p); tcf_idr_release(*a, bind); return err; } static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; struct tc_ife opt = { .index = ife->tcf_index, .refcnt = refcount_read(&ife->tcf_refcnt) - ref, .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&ife->tcf_lock); opt.action = ife->tcf_action; p = rcu_dereference_protected(ife->params, lockdep_is_held(&ife->tcf_lock)); opt.flags = p->flags; if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &ife->tcf_tm); if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; if (!is_zero_ether_addr(p->eth_dst)) { if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst)) goto nla_put_failure; } if (!is_zero_ether_addr(p->eth_src)) { if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src)) goto nla_put_failure; } if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; if (dump_metalist(skb, ife)) { /*ignore failure to dump metalist */ pr_info("Failed to dump metalist\n"); } spin_unlock_bh(&ife->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&ife->tcf_lock); nlmsg_trim(skb, b); return -1; } static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, u16 metaid, u16 mlen, void *mdata) { struct tcf_meta_info *e; /* XXX: use hash to speed up */ list_for_each_entry(e, &ife->metalist, metalist) { if (metaid == e->metaid) { if (e->ops) { /* We check for decode presence already */ return e->ops->decode(skb, mdata, mlen); } } } return -ENOENT; } static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; u8 *ifehdr_end; u8 *tlv_data; u16 metalen; bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); tlv_data = ife_decode(skb, &metalen); if (unlikely(!tlv_data)) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } ifehdr_end = tlv_data + metalen; for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) { u8 *curr_data; u16 mtype; u16 dlen; curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype, &dlen, NULL); if (!curr_data) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata * but dont have an ops for it */ pr_info_ratelimited("Unknown metaid %d dlen %d\n", mtype, dlen); qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); } } if (WARN_ON(tlv_data != ifehdr_end)) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skb->protocol = eth_type_trans(skb, skb->dev); skb_reset_network_header(skb); return action; } /*XXX: check if we can do this at install time instead of current * send data path **/ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) { struct tcf_meta_info *e, *n; int tot_run_sz = 0, run_sz = 0; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { if (e->ops->check_presence) { run_sz = e->ops->check_presence(skb, e); tot_run_sz += run_sz; } } return tot_run_sz; } static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res, struct tcf_ife_params *p) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ethhdr *oethh; /* outer ether header */ struct tcf_meta_info *e; /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA where ORIGDATA = original ethernet header ... */ u16 metalen = ife_get_sz(skb, ife); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; unsigned int skboff = 0; int new_len = skb->len + hdrm; bool exceed_mtu = false; void *ife_meta; int err = 0; if (!skb_at_tc_ingress(skb)) { if (new_len > skb->dev->mtu) exceed_mtu = true; } bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet * with no metadata */ qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); return action; } /* could be stupid policy setup or mtu config * so lets be conservative.. */ if ((action == TC_ACT_SHOT) || exceed_mtu) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); ife_meta = ife_encode(skb, metalen); spin_lock(&ife->tcf_lock); /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... */ list_for_each_entry(e, &ife->metalist, metalist) { if (e->ops->encode) { err = e->ops->encode(skb, (void *)(ife_meta + skboff), e); } if (err < 0) { /* too corrupt to keep around if overwritten */ spin_unlock(&ife->tcf_lock); qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skboff += err; } spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; if (!is_zero_ether_addr(p->eth_src)) ether_addr_copy(oethh->h_source, p->eth_src); if (!is_zero_ether_addr(p->eth_dst)) ether_addr_copy(oethh->h_dest, p->eth_dst); oethh->h_proto = htons(p->eth_type); if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); return action; } TC_INDIRECT_SCOPE int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; int ret; p = rcu_dereference_bh(ife->params); if (p->flags & IFE_ENCODE) { ret = tcf_ife_encode(skb, a, res, p); return ret; } return tcf_ife_decode(skb, a, res); } static struct tc_action_ops act_ife_ops = { .kind = "ife", .id = TCA_ID_IFE, .owner = THIS_MODULE, .act = tcf_ife_act, .dump = tcf_ife_dump, .cleanup = tcf_ife_cleanup, .init = tcf_ife_init, .size = sizeof(struct tcf_ife_info), }; MODULE_ALIAS_NET_ACT("ife"); static __net_init int ife_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id); return tc_action_net_init(net, tn, &act_ife_ops); } static void __net_exit ife_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_ife_ops.net_id); } static struct pernet_operations ife_net_ops = { .init = ife_init_net, .exit_batch = ife_exit_net, .id = &act_ife_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init ife_init_module(void) { return tcf_register_action(&act_ife_ops, &ife_net_ops); } static void __exit ife_cleanup_module(void) { tcf_unregister_action(&act_ife_ops, &ife_net_ops); } module_init(ife_init_module); module_exit(ife_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE LFB action"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ /* * PHY device list allow maintaining a list of PHY devices that are * part of a netdevice's link topology. PHYs can for example be chained, * as is the case when using a PHY that exposes an SFP module, on which an * SFP transceiver that embeds a PHY is connected. * * This list can then be used by userspace to leverage individual PHY * capabilities. */ #ifndef __PHY_LINK_TOPOLOGY_H #define __PHY_LINK_TOPOLOGY_H #include <linux/ethtool.h> #include <linux/netdevice.h> struct xarray; struct phy_device; struct sfp_bus; struct phy_link_topology { struct xarray phys; u32 next_phy_index; }; struct phy_device_node { enum phy_upstream upstream_type; union { struct net_device *netdev; struct phy_device *phydev; } upstream; struct sfp_bus *parent_sfp_bus; struct phy_device *phy; }; #if IS_ENABLED(CONFIG_PHYLIB) int phy_link_topo_add_phy(struct net_device *dev, struct phy_device *phy, enum phy_upstream upt, void *upstream); void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy); static inline struct phy_device * phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) { struct phy_link_topology *topo = dev->link_topo; struct phy_device_node *pdn; if (!topo) return NULL; pdn = xa_load(&topo->phys, phyindex); if (pdn) return pdn->phy; return NULL; } #else static inline int phy_link_topo_add_phy(struct net_device *dev, struct phy_device *phy, enum phy_upstream upt, void *upstream) { return 0; } static inline void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy) { } static inline struct phy_device * phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) { return NULL; } #endif #endif /* __PHY_LINK_TOPOLOGY_H */
1166 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM pagemap #if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGEMAP_H #include <linux/tracepoint.h> #include <linux/mm.h> #define PAGEMAP_MAPPED 0x0001u #define PAGEMAP_ANONYMOUS 0x0002u #define PAGEMAP_FILE 0x0004u #define PAGEMAP_SWAPCACHE 0x0008u #define PAGEMAP_SWAPBACKED 0x0010u #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u #define trace_pagemap_flags(folio) ( \ (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); __entry->lru = folio_lru_list(folio); __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry-&