5 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM block #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLOCK_H #include <linux/blktrace_api.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/tracepoint.h> #define RWBS_LEN 8 DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh), TP_STRUCT__entry ( __field( dev_t, dev ) __field( sector_t, sector ) __field( size_t, size ) ), TP_fast_assign( __entry->dev = bh->b_bdev->bd_dev; __entry->sector = bh->b_blocknr; __entry->size = bh->b_size; ), TP_printk("%d,%d sector=%llu size=%zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->sector, __entry->size ) ); /** * block_touch_buffer - mark a buffer accessed * @bh: buffer_head being touched * * Called from touch_buffer(). */ DEFINE_EVENT(block_buffer, block_touch_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_dirty_buffer - mark a buffer dirty * @bh: buffer_head being dirtied * * Called from mark_buffer_dirty(). */ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_rq_requeue - place block IO request back on a queue * @rq: block IO operation request * * The block operation request @rq is being placed back into queue * @q. For some reason the request was not completed and needs to be * put back in the queue. */ TRACE_EVENT(block_rq_requeue, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, 0) ); /** * block_rq_complete - block IO operation completed by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion * of operation request has been completed by the device driver. If * the @rq->bio is %NULL, then there is absolutely no additional work to * do for the request. If @rq->bio is non-NULL then there is * additional work required to complete the request. */ TRACE_EVENT(block_rq_complete, TP_PROTO(struct request *rq, int error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = error; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_rq_insert - insert block operation request into queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted * into queue @q. The fields in the operation request @rq struct can * be examined to determine which device and sectors the pending * operation would access. */ DEFINE_EVENT(block_rq, block_rq_insert, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver * @rq: block IO operation request * * Called when block operation request @rq from queue @q is sent to a * device driver for processing. */ DEFINE_EVENT(block_rq, block_rq_issue, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator * @rq: block IO operation request * * Called when block operation request @rq from queue @q is merged to another * request queued in the elevator. */ DEFINE_EVENT(block_rq, block_rq_merge, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation * @bio: block operation completed * * This tracepoint indicates there is no further work to do on this * block IO operation @bio. */ TRACE_EVENT(block_bio_complete, TP_PROTO(struct request_queue *q, struct bio *bio), TP_ARGS(q, bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_bio_bounce - used bounce buffer when processing block operation * @bio: block operation * * A bounce buffer was used to handle the block operation @bio in @q. * This occurs when hardware limitations prevent a direct transfer of * data between the @bio data memory area and the IO device. Use of a * bounce buffer requires extra copying of data and decreases * performance. */ DEFINE_EVENT(block_bio, block_bio_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_backmerge - merging block operation to the end of an existing operation * @bio: new block operation to merge * * Merging block request @bio to the end of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation * @bio: new block operation to merge * * Merging block IO operation @bio to the beginning of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue * @bio: new block operation * * About to place the block IO operation @bio into queue @q. */ DEFINE_EVENT(block_bio, block_bio_queue, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations * @bio: pending block IO operation (can be %NULL) * * A request struct has been allocated to handle the block IO operation @bio. */ DEFINE_EVENT(block_bio, block_getrq, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_plug - keep operations requests in request queue * @q: request queue to plug * * Plug the request queue @q. Do not allow block operation requests * to be sent to the device driver. Instead, accumulate requests in * the queue to improve throughput performance of the block device. */ TRACE_EVENT(block_plug, TP_PROTO(struct request_queue *q), TP_ARGS(q), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s]", __entry->comm) ); DECLARE_EVENT_CLASS(block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit), TP_STRUCT__entry( __field( int, nr_rq ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->nr_rq = depth; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) ); /** * block_unplug - release of operations requests in request queue * @q: request queue to unplug * @depth: number of requests just added to the queue * @explicit: whether this was an explicit unplug, or one from schedule() * * Unplug request queue @q because device driver is scheduled to work * on elements in the request queue. */ DEFINE_EVENT(block_unplug, block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit) ); /** * block_split - split a single bio struct into two bio structs * @bio: block operation being split * @new_sector: The starting sector for the new bio * * The bio request @bio needs to be split into two bio requests. The newly * created @bio request starts at @new_sector. This split may be required due to * hardware limitations such as operation crossing device boundaries in a RAID * system. */ TRACE_EVENT(block_split, TP_PROTO(struct bio *bio, unsigned int new_sector), TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, new_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, (unsigned long long)__entry->new_sector, __entry->comm) ); /** * block_bio_remap - map request for a logical device to the raw device * @bio: revised operation * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the * raw block device. */ TRACE_EVENT(block_bio_remap, TP_PROTO(struct bio *bio, dev_t dev, sector_t from), TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector) ); /** * block_rq_remap - map request for a block operation request * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation * * The block operation request @rq in @q has been remapped. The block * operation request @rq holds the current information and @from hold * the original sector. */ TRACE_EVENT(block_rq_remap, TP_PROTO(struct request *rq, dev_t dev, sector_t from), TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __field( unsigned int, nr_bios ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = disk_devt(rq->rq_disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector, __entry->nr_bios) ); #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
569 94 569 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_TIMEOUT_H #define _NF_CONNTRACK_TIMEOUT_H #include <net/net_namespace.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #define CTNL_TIMEOUT_NAME_MAX 32 struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; char data[]; }; struct ctnl_timeout { struct list_head head; struct rcu_head rcu_head; refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; struct nf_ct_timeout timeout; }; struct nf_conn_timeout { struct nf_ct_timeout __rcu *timeout; }; static inline unsigned int * nf_ct_timeout_data(const struct nf_conn_timeout *t) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout *timeout; timeout = rcu_dereference(t->timeout); if (timeout == NULL) return NULL; return (unsigned int *)timeout->data; #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT return nf_ct_ext_find(ct, NF_CT_EXT_TIMEOUT); #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, struct nf_ct_timeout *timeout, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_ext_add(ct, NF_CT_EXT_TIMEOUT, gfp); if (timeout_ext == NULL) return NULL; rcu_assign_pointer(timeout_ext->timeout, timeout); return timeout_ext; #else return NULL; #endif }; static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) { unsigned int *timeouts = NULL; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) timeouts = nf_ct_timeout_data(timeout_ext); #endif return timeouts; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT int nf_conntrack_timeout_init(void); void nf_conntrack_timeout_fini(void); void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name); void nf_ct_destroy_timeout(struct nf_conn *ct); #else static inline int nf_conntrack_timeout_init(void) { return 0; } static inline void nf_conntrack_timeout_fini(void) { return; } static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) { return -EOPNOTSUPP; } static inline void nf_ct_destroy_timeout(struct nf_conn *ct) { return; } #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT extern struct nf_ct_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); extern void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout); #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ |
4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _LINUX_IO_URING_H #define _LINUX_IO_URING_H #include <linux/sched.h> #include <linux/xarray.h> #if defined(CONFIG_IO_URING) void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { if (current->io_uring) __io_uring_cancel(false); } static inline void io_uring_task_cancel(void) { if (current->io_uring) __io_uring_cancel(true); } static inline void io_uring_free(struct task_struct *tsk) { if (tsk->io_uring) __io_uring_free(tsk); } #else static inline void io_uring_task_cancel(void) { } static inline void io_uring_files_cancel(void) { } static inline void io_uring_free(struct task_struct *tsk) { } static inline bool io_is_uring_fops(struct file *file) { return false; } #endif #endif |
1199 1199 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_H #define _ASM_X86_PAGE_H #include <linux/types.h> #ifdef __KERNEL__ #include <asm/page_types.h> #ifdef CONFIG_X86_64 #include <asm/page_64.h> #else #include <asm/page_32.h> #endif /* CONFIG_X86_64 */ #ifndef __ASSEMBLY__ struct page; #include <linux/range.h> extern struct range pfn_mapped[]; extern int nr_pfn_mapped; static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); } static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { copy_page(to, from); } #define alloc_zeroed_user_highpage_movable(vma, vaddr) \ alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) #endif #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ /* * We need __phys_reloc_hide() here because gcc may assume that there is no * overflow during __pa() calculation and can optimize it unexpectedly. * Newer versions of gcc provide -fno-strict-overflow switch to handle this * case properly. Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) /* * virt_to_page(kaddr) returns a valid pointer if and only if * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) { return __canonical_address(vaddr, vaddr_bits) == vaddr; } #endif /* __ASSEMBLY__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ #endif /* _ASM_X86_PAGE_H */ |
63 67 67 13 4 4 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/seq_file.c * * helper functions for making synthetic files from sequences of records. * initial implementation -- AV, Oct 2001. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mm.h> #include <linux/printk.h> #include <linux/string_helpers.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <asm/page.h> static struct kmem_cache *seq_file_cache __ro_after_init; static void seq_set_overflow(struct seq_file *m) { m->count = m->size; } static void *seq_buf_alloc(unsigned long size) { if (unlikely(size > MAX_RW_COUNT)) return NULL; return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** * seq_open - initialize sequential file * @file: file we initialize * @op: method table describing the sequence * * seq_open() sets @file, associating it with a sequence described * by @op. @op->start() sets the iterator up and returns the first * element of sequence. @op->stop() shuts it down. @op->next() * returns the next element of sequence. @op->show() prints element * into the buffer. In case of error ->start() and ->next() return * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. * Returning SEQ_SKIP means "discard this element and move on". * Note: seq_open() will allocate a struct seq_file and store its * pointer in @file->private_data. This pointer should not be modified. */ int seq_open(struct file *file, const struct seq_operations *op) { struct seq_file *p; WARN_ON(file->private_data); p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; file->private_data = p; mutex_init(&p->lock); p->op = op; // No refcounting: the lifetime of 'p' is constrained // to the lifetime of the file. p->file = file; /* * seq_files support lseek() and pread(). They do not implement * write() at all, but we clear FMODE_PWRITE here for historical * reasons. * * If a client of seq_files a) implements file.write() and b) wishes to * support pwrite() then that client will need to implement its own * file.open() which calls seq_open() and then sets FMODE_PWRITE. */ file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); static int traverse(struct seq_file *m, loff_t offset) { loff_t pos = 0; int error = 0; void *p; m->index = 0; m->count = m->from = 0; if (!offset) return 0; if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } p = m->op->start(m, &m->index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) break; error = m->op->show(m, p); if (error < 0) break; if (unlikely(error)) { error = 0; m->count = 0; } if (seq_has_overflowed(m)) goto Eoverflow; p = m->op->next(m, p, &m->index); if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; break; } pos += m->count; m->count = 0; if (pos == offset) break; } m->op->stop(m, p); return error; Eoverflow: m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } /** * seq_read - ->read() method for sequential files. * @file: the file to read from * @buf: the buffer to read to * @size: the maximum number of bytes to read * @ppos: the current position in the file * * Ready-made ->f_op->read() */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = size}; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, file); iov_iter_init(&iter, READ, &iov, 1, size); kiocb.ki_pos = *ppos; ret = seq_read_iter(&kiocb, &iter); *ppos = kiocb.ki_pos; return ret; } EXPORT_SYMBOL(seq_read); /* * Ready-made ->f_op->read_iter() */ ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct seq_file *m = iocb->ki_filp->private_data; size_t copied = 0; size_t n; void *p; int err = 0; if (!iov_iter_count(iter)) return 0; mutex_lock(&m->lock); /* * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } /* Don't assume ki_pos is where we left it */ if (unlikely(iocb->ki_pos != m->read_pos)) { while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ m->read_pos = 0; m->index = 0; m->count = 0; goto Done; } else { m->read_pos = iocb->ki_pos; } } /* grab buffer if we didn't have one */ if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } // something left in the buffer - copy it out first if (m->count) { n = copy_to_iter(m->buf + m->from, m->count, iter); m->count -= n; m->from += n; copied += n; if (m->count) // hadn't managed to copy everything goto Done; } // get a non-empty record in the buffer m->from = 0; p = m->op->start(m, &m->index); while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) // EOF or an error break; err = m->op->show(m, p); if (err < 0) // hard error break; if (unlikely(err)) // ->show() says "skip it" m->count = 0; if (unlikely(!m->count)) { // empty record p = m->op->next(m, p, &m->index); continue; } if (!seq_has_overflowed(m)) // got it goto Fill; // need a bigger buffer m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; p = m->op->start(m, &m->index); } // EOF or an error m->op->stop(m, p); m->count = 0; goto Done; Fill: // one non-empty record is in the buffer; if they want more, // try to fit more in, but in any case we need to advance // the iterator once for every record shown. while (1) { size_t offs = m->count; loff_t pos = m->index; p = m->op->next(m, p, &m->index); if (pos == m->index) { pr_info_ratelimited("buggy .next function %ps did not update position index\n", m->op->next); m->index++; } if (!p || IS_ERR(p)) // no next record for us break; if (m->count >= iov_iter_count(iter)) break; err = m->op->show(m, p); if (err > 0) { // ->show() says "skip it" m->count = offs; } else if (err || seq_has_overflowed(m)) { m->count = offs; break; } } m->op->stop(m, p); n = copy_to_iter(m->buf, m->count, iter); copied += n; m->count -= n; m->from = n; Done: if (unlikely(!copied)) { copied = m->count ? -EFAULT : err; } else { iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); return copied; Enomem: err = -ENOMEM; goto Done; } EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. * @file: the file in question * @offset: new position * @whence: 0 for absolute, 1 for relative position * * Ready-made ->f_op->llseek() */ loff_t seq_lseek(struct file *file, loff_t offset, int whence) { struct seq_file *m = file->private_data; loff_t retval = -EINVAL; mutex_lock(&m->lock); switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset < 0) break; retval = offset; if (offset != m->read_pos) { while ((retval = traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; m->read_pos = 0; m->index = 0; m->count = 0; } else { m->read_pos = offset; retval = file->f_pos = offset; } } else { file->f_pos = offset; } } mutex_unlock(&m->lock); return retval; } EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. * @file: file in question * @inode: its inode * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. */ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); /** * seq_escape_mem - print data into buffer, escaping some characters * @m: target buffer * @src: source buffer * @len: size of source buffer * @flags: flags to pass to string_escape_mem() * @esc: set of characters that need escaping * * Puts data into buffer, replacing each occurrence of character from * given class (defined by @flags and @esc) with printable escaped sequence. * * Use seq_has_overflowed() to check for errors. */ void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int ret; ret = string_escape_mem(src, len, buf, size, flags, esc); seq_commit(m, ret < size ? ret : -1); } EXPORT_SYMBOL(seq_escape_mem); /** * seq_escape - print string into buffer, escaping some characters * @m: target buffer * @s: string * @esc: set of characters that need escaping * * Puts string into buffer, replacing each occurrence of character from * @esc with usual octal escape. * Use seq_has_overflowed() to check for errors. */ void seq_escape(struct seq_file *m, const char *s, const char *esc) { seq_escape_str(m, s, ESCAPE_OCTAL, esc); } EXPORT_SYMBOL(seq_escape); void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; if (m->count < m->size) { len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_vprintf); void seq_printf(struct seq_file *m, const char *f, ...) { va_list args; va_start(args, f); seq_vprintf(m, f, args); va_end(args); } EXPORT_SYMBOL(seq_printf); #ifdef CONFIG_BINARY_PRINTF void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary) { int len; if (m->count < m->size) { len = bstr_printf(m->buf + m->count, m->size - m->count, f, binary); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_bprintf); #endif /* CONFIG_BINARY_PRINTF */ /** * mangle_path - mangle and copy path to buffer beginning * @s: buffer start * @p: beginning of path in above buffer * @esc: set of characters that need escaping * * Copy the path from @p to @s, replacing each occurrence of character from * @esc with usual octal escape. * Returns pointer past last written character in @s, or NULL in case of * failure. */ char *mangle_path(char *s, const char *p, const char *esc) { while (s <= p) { char c = *p++; if (!c) { return s; } else if (!strchr(esc, c)) { *s++ = c; } else if (s + 4 > p) { break; } else { *s++ = '\\'; *s++ = '0' + ((c & 0300) >> 6); *s++ = '0' + ((c & 070) >> 3); *s++ = '0' + (c & 07); } } return NULL; } EXPORT_SYMBOL(mangle_path); /** * seq_path - seq_file interface to print a pathname * @m: the seq_file handle * @path: the struct path to print * @esc: set of characters to escape in the output * * return the absolute path of 'path', as represented by the * dentry / mnt pair in the path parameter. */ int seq_path(struct seq_file *m, const struct path *path, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = d_path(path, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_path); /** * seq_file_path - seq_file interface to print a pathname of a file * @m: the seq_file handle * @file: the struct file to print * @esc: set of characters to escape in the output * * return the absolute path to the file. */ int seq_file_path(struct seq_file *m, struct file *file, const char *esc) { return seq_path(m, &file->f_path, esc); } EXPORT_SYMBOL(seq_file_path); /* * Same as seq_path, but relative to supplied root. */ int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -ENAMETOOLONG; if (size) { char *p; p = __d_path(path, root, buf, size); if (!p) return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; else res = -ENAMETOOLONG; } } seq_commit(m, res); return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* * returns the path of the 'dentry' from the root of its filesystem. */ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = dentry_path(dentry, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_dentry); static void *single_start(struct seq_file *p, loff_t *pos) { return NULL + (*pos == 0); } static void *single_next(struct seq_file *p, void *v, loff_t *pos) { ++*pos; return NULL; } static void single_stop(struct seq_file *p, void *v) { } int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { op->start = single_start; op->next = single_next; op->stop = single_stop; op->show = show; res = seq_open(file, op); if (!res) ((struct seq_file *)file->private_data)->private = data; else kfree(op); } return res; } EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; ((struct seq_file *)file->private_data)->size = size; return 0; } EXPORT_SYMBOL(single_open_size); int single_release(struct inode *inode, struct file *file) { const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; int res = seq_release(inode, file); kfree(op); return res; } EXPORT_SYMBOL(single_release); int seq_release_private(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); seq->private = NULL; return seq_release(inode, file); } EXPORT_SYMBOL(seq_release_private); void *__seq_open_private(struct file *f, const struct seq_operations *ops, int psize) { int rc; void *private; struct seq_file *seq; private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; rc = seq_open(f, ops); if (rc < 0) goto out_free; seq = f->private_data; seq->private = private; return private; out_free: kfree(private); out: return NULL; } EXPORT_SYMBOL(__seq_open_private); int seq_open_private(struct file *filp, const struct seq_operations *ops, int psize) { return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; } EXPORT_SYMBOL(seq_open_private); void seq_putc(struct seq_file *m, char c) { if (m->count >= m->size) return; m->buf[m->count++] = c; } EXPORT_SYMBOL(seq_putc); void seq_puts(struct seq_file *m, const char *s) { int len = strlen(s); if (m->count + len >= m->size) { seq_set_overflow(m); return; } memcpy(m->buf + m->count, s, len); m->count += len; } EXPORT_SYMBOL(seq_puts); /** * seq_put_decimal_ull_width - A helper routine for putting decimal numbers * without rich format of printf(). * only 'unsigned long long' is supported. * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @num: the number * @width: a minimum field width * * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width) { int len; if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (!width) width = 1; if (m->count + width >= m->size) goto overflow; len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num) { return seq_put_decimal_ull_width(m, delimiter, num, 0); } EXPORT_SYMBOL(seq_put_decimal_ull); /** * seq_put_hex_ll - put a number in hexadecimal notation * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @v: the number * @width: a minimum field width * * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) * * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width) { unsigned int len; int i; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } /* If x is 0, the result of __builtin_clzll is undefined */ if (v == 0) len = 1; else len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; if (len < width) len = width; if (m->count + len > m->size) { seq_set_overflow(m); return; } for (i = len - 1; i >= 0; i--) { m->buf[m->count + i] = hex_asc[0xf & v]; v = v >> 4; } m->count += len; } void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (m->count + 2 >= m->size) goto overflow; if (num < 0) { m->buf[m->count++] = '-'; num = -num; } if (num < 10) { m->buf[m->count++] = num + '0'; return; } len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } EXPORT_SYMBOL(seq_put_decimal_ll); /** * seq_write - write arbitrary data to buffer * @seq: seq_file identifying the buffer to which data should be written * @data: data address * @len: number of bytes * * Return 0 on success, non-zero otherwise. */ int seq_write(struct seq_file *seq, const void *data, size_t len) { if (seq->count + len < seq->size) { memcpy(seq->buf + seq->count, data, len); seq->count += len; return 0; } seq_set_overflow(seq); return -1; } EXPORT_SYMBOL(seq_write); /** * seq_pad - write padding spaces to buffer * @m: seq_file identifying the buffer to which data should be written * @c: the byte to append after padding if non-zero */ void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; if (size > 0) { if (size + m->count > m->size) { seq_set_overflow(m); return; } memset(m->buf + m->count, ' ', size); m->count += size; } if (c) seq_putc(m, c); } EXPORT_SYMBOL(seq_pad); /* A complete analogue of print_hex_dump() */ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; char *buffer; size_t size; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; switch (prefix_type) { case DUMP_PREFIX_ADDRESS: seq_printf(m, "%s%p: ", prefix_str, ptr + i); break; case DUMP_PREFIX_OFFSET: seq_printf(m, "%s%.8x: ", prefix_str, i); break; default: seq_printf(m, "%s", prefix_str); break; } size = seq_get_buf(m, &buffer); ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, buffer, size, ascii); seq_commit(m, ret < size ? ret : -1); seq_putc(m, '\n'); } } EXPORT_SYMBOL(seq_hex_dump); struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start); struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head); struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = ((struct list_head *)v)->next; ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next); struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each_rcu(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start_rcu); struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head_rcu); struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = list_next_rcu((struct list_head *)v); ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next_rcu); /** * seq_hlist_start - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). */ struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) { struct hlist_node *node; hlist_for_each(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start); /** * seq_hlist_start_head - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. */ struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head); /** * seq_hlist_next - move to the next position of the hlist * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). */ struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return head->first; else return node->next; } EXPORT_SYMBOL(seq_hlist_next); /** * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos) { struct hlist_node *node; __hlist_for_each_rcu(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start_rcu); /** * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head_rcu); /** * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return rcu_dereference(head->first); else return rcu_dereference(node->next); } EXPORT_SYMBOL(seq_hlist_next_rcu); /** * seq_hlist_start_percpu - start an iteration of a percpu hlist array * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->start(). */ struct hlist_node * seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos) { struct hlist_node *node; for_each_possible_cpu(*cpu) { hlist_for_each(node, per_cpu_ptr(head, *cpu)) { if (pos-- == 0) return node; } } return NULL; } EXPORT_SYMBOL(seq_hlist_start_percpu); /** * seq_hlist_next_percpu - move to the next position of the percpu hlist array * @v: pointer to current hlist_node * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->next(). */ struct hlist_node * seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos) { struct hlist_node *node = v; ++*pos; if (node->next) return node->next; for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids; *cpu = cpumask_next(*cpu, cpu_possible_mask)) { struct hlist_head *bucket = per_cpu_ptr(head, *cpu); if (!hlist_empty(bucket)) return bucket->first; } return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); void __init seq_file_init(void) { seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); } |
40 33 1 4 8 55 55 55 55 55 55 20 20 20 20 10 10 10 4 4 33 33 33 32 33 33 40 40 40 1 45 45 44 1 1 6 6 35 35 4 4 33 34 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH * Copyright (C) 2018 - 2019, 2021 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS #define __MAC80211_DRIVER_OPS #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" #define check_sdata_in_driver(sdata) ({ \ !WARN_ONCE(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), \ "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", \ sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); \ }) static inline struct ieee80211_sub_if_data * get_bss_sdata(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); return sdata; } static inline void drv_tx(struct ieee80211_local *local, struct ieee80211_tx_control *control, struct sk_buff *skb) { local->ops->tx(&local->hw, control, skb); } static inline void drv_sync_rx_queues(struct ieee80211_local *local, struct sta_info *sta) { if (local->ops->sync_rx_queues) { trace_drv_sync_rx_queues(local, sta->sdata, &sta->sta); local->ops->sync_rx_queues(&local->hw); trace_drv_return_void(local); } } static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata, u32 sset, u8 *data) { struct ieee80211_local *local = sdata->local; if (local->ops->get_et_strings) { trace_drv_get_et_strings(local, sset); local->ops->get_et_strings(&local->hw, &sdata->vif, sset, data); trace_drv_return_void(local); } } static inline void drv_get_et_stats(struct ieee80211_sub_if_data *sdata, struct ethtool_stats *stats, u64 *data) { struct ieee80211_local *local = sdata->local; if (local->ops->get_et_stats) { trace_drv_get_et_stats(local); local->ops->get_et_stats(&local->hw, &sdata->vif, stats, data); trace_drv_return_void(local); } } static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata, int sset) { struct ieee80211_local *local = sdata->local; int rv = 0; if (local->ops->get_et_sset_count) { trace_drv_get_et_sset_count(local, sset); rv = local->ops->get_et_sset_count(&local->hw, &sdata->vif, sset); trace_drv_return_int(local, rv); } return rv; } int drv_start(struct ieee80211_local *local); void drv_stop(struct ieee80211_local *local); #ifdef CONFIG_PM static inline int drv_suspend(struct ieee80211_local *local, struct cfg80211_wowlan *wowlan) { int ret; might_sleep(); trace_drv_suspend(local); ret = local->ops->suspend(&local->hw, wowlan); trace_drv_return_int(local, ret); return ret; } static inline int drv_resume(struct ieee80211_local *local) { int ret; might_sleep(); trace_drv_resume(local); ret = local->ops->resume(&local->hw); trace_drv_return_int(local, ret); return ret; } static inline void drv_set_wakeup(struct ieee80211_local *local, bool enabled) { might_sleep(); if (!local->ops->set_wakeup) return; trace_drv_set_wakeup(local, enabled); local->ops->set_wakeup(&local->hw, enabled); trace_drv_return_void(local); } #endif int drv_add_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p); void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); static inline int drv_config(struct ieee80211_local *local, u32 changed) { int ret; might_sleep(); trace_drv_config(local, changed); ret = local->ops->config(&local->hw, changed); trace_drv_return_int(local, ret); return ret; } static inline void drv_bss_info_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *info, u32 changed) { might_sleep(); if (WARN_ON_ONCE(changed & (BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED) && sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT && sdata->vif.type != NL80211_IFTYPE_OCB)) return; if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !sdata->vif.mu_mimo_owner && !(changed & BSS_CHANGED_TXPOWER)))) return; if (!check_sdata_in_driver(sdata)) return; trace_drv_bss_info_changed(local, sdata, info, changed); if (local->ops->bss_info_changed) local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed); trace_drv_return_void(local); } static inline u64 drv_prepare_multicast(struct ieee80211_local *local, struct netdev_hw_addr_list *mc_list) { u64 ret = 0; trace_drv_prepare_multicast(local, mc_list->count); if (local->ops->prepare_multicast) ret = local->ops->prepare_multicast(&local->hw, mc_list); trace_drv_return_u64(local, ret); return ret; } static inline void drv_configure_filter(struct ieee80211_local *local, unsigned int changed_flags, unsigned int *total_flags, u64 multicast) { might_sleep(); trace_drv_configure_filter(local, changed_flags, total_flags, multicast); local->ops->configure_filter(&local->hw, changed_flags, total_flags, multicast); trace_drv_return_void(local); } static inline void drv_config_iface_filter(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int filter_flags, unsigned int changed_flags) { might_sleep(); trace_drv_config_iface_filter(local, sdata, filter_flags, changed_flags); if (local->ops->config_iface_filter) local->ops->config_iface_filter(&local->hw, &sdata->vif, filter_flags, changed_flags); trace_drv_return_void(local); } static inline int drv_set_tim(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set) { int ret = 0; trace_drv_set_tim(local, sta, set); if (local->ops->set_tim) ret = local->ops->set_tim(&local->hw, sta, set); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_key(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key) { int ret; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_set_key(local, cmd, sdata, sta, key); ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key); trace_drv_return_int(local, ret); return ret; } static inline void drv_update_tkip_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_key_conf *conf, struct sta_info *sta, u32 iv32, u16 *phase1key) { struct ieee80211_sta *ista = NULL; if (sta) ista = &sta->sta; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_update_tkip_key(local, sdata, conf, ista, iv32); if (local->ops->update_tkip_key) local->ops->update_tkip_key(&local->hw, &sdata->vif, conf, ista, iv32, phase1key); trace_drv_return_void(local); } static inline int drv_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_request *req) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_hw_scan(local, sdata); ret = local->ops->hw_scan(&local->hw, &sdata->vif, req); trace_drv_return_int(local, ret); return ret; } static inline void drv_cancel_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_cancel_hw_scan(local, sdata); local->ops->cancel_hw_scan(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_sched_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req, struct ieee80211_scan_ies *ies) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sched_scan_start(local, sdata); ret = local->ops->sched_scan_start(&local->hw, &sdata->vif, req, ies); trace_drv_return_int(local, ret); return ret; } static inline int drv_sched_scan_stop(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sched_scan_stop(local, sdata); ret = local->ops->sched_scan_stop(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_sw_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const u8 *mac_addr) { might_sleep(); trace_drv_sw_scan_start(local, sdata, mac_addr); if (local->ops->sw_scan_start) local->ops->sw_scan_start(&local->hw, &sdata->vif, mac_addr); trace_drv_return_void(local); } static inline void drv_sw_scan_complete(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); trace_drv_sw_scan_complete(local, sdata); if (local->ops->sw_scan_complete) local->ops->sw_scan_complete(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_get_stats(struct ieee80211_local *local, struct ieee80211_low_level_stats *stats) { int ret = -EOPNOTSUPP; might_sleep(); if (local->ops->get_stats) ret = local->ops->get_stats(&local->hw, stats); trace_drv_get_stats(local, stats, ret); return ret; } static inline void drv_get_key_seq(struct ieee80211_local *local, struct ieee80211_key *key, struct ieee80211_key_seq *seq) { if (local->ops->get_key_seq) local->ops->get_key_seq(&local->hw, &key->conf, seq); trace_drv_get_key_seq(local, &key->conf); } static inline int drv_set_frag_threshold(struct ieee80211_local *local, u32 value) { int ret = 0; might_sleep(); trace_drv_set_frag_threshold(local, value); if (local->ops->set_frag_threshold) ret = local->ops->set_frag_threshold(&local->hw, value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_rts_threshold(struct ieee80211_local *local, u32 value) { int ret = 0; might_sleep(); trace_drv_set_rts_threshold(local, value); if (local->ops->set_rts_threshold) ret = local->ops->set_rts_threshold(&local->hw, value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_coverage_class(struct ieee80211_local *local, s16 value) { int ret = 0; might_sleep(); trace_drv_set_coverage_class(local, value); if (local->ops->set_coverage_class) local->ops->set_coverage_class(&local->hw, value); else ret = -EOPNOTSUPP; trace_drv_return_int(local, ret); return ret; } static inline void drv_sta_notify(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum sta_notify_cmd cmd, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_notify(local, sdata, cmd, sta); if (local->ops->sta_notify) local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta); trace_drv_return_void(local); } static inline int drv_sta_add(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { int ret = 0; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_add(local, sdata, sta); if (local->ops->sta_add) ret = local->ops->sta_add(&local->hw, &sdata->vif, sta); trace_drv_return_int(local, ret); return ret; } static inline void drv_sta_remove(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_remove(local, sdata, sta); if (local->ops->sta_remove) local->ops->sta_remove(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } #ifdef CONFIG_MAC80211_DEBUGFS static inline void drv_sta_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct dentry *dir) { might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->sta_add_debugfs) local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_pre_rcu_remove(local, sdata, &sta->sta); if (local->ops->sta_pre_rcu_remove) local->ops->sta_pre_rcu_remove(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_void(local); } __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); __must_check int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta); void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u32 changed); static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_rate_tbl_update(local, sdata, sta); if (local->ops->sta_rate_tbl_update) local->ops->sta_rate_tbl_update(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } static inline void drv_sta_statistics(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct station_info *sinfo) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_statistics(local, sdata, sta); if (local->ops->sta_statistics) local->ops->sta_statistics(&local->hw, &sdata->vif, sta, sinfo); trace_drv_return_void(local); } int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 ac, const struct ieee80211_tx_queue_params *params); u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf); void drv_offset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset); void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); static inline int drv_tx_last_beacon(struct ieee80211_local *local) { int ret = 0; /* default unsupported op for less congestion */ might_sleep(); trace_drv_tx_last_beacon(local); if (local->ops->tx_last_beacon) ret = local->ops->tx_last_beacon(&local->hw); trace_drv_return_int(local, ret); return ret; } int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params); static inline int drv_get_survey(struct ieee80211_local *local, int idx, struct survey_info *survey) { int ret = -EOPNOTSUPP; trace_drv_get_survey(local, idx, survey); if (local->ops->get_survey) ret = local->ops->get_survey(&local->hw, idx, survey); trace_drv_return_int(local, ret); return ret; } static inline void drv_rfkill_poll(struct ieee80211_local *local) { might_sleep(); if (local->ops->rfkill_poll) local->ops->rfkill_poll(&local->hw); } static inline void drv_flush(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u32 queues, bool drop) { struct ieee80211_vif *vif = sdata ? &sdata->vif : NULL; might_sleep(); if (sdata && !check_sdata_in_driver(sdata)) return; trace_drv_flush(local, queues, drop); if (local->ops->flush) local->ops->flush(&local->hw, vif, queues, drop); trace_drv_return_void(local); } static inline void drv_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { might_sleep(); trace_drv_channel_switch(local, sdata, ch_switch); local->ops->channel_switch(&local->hw, &sdata->vif, ch_switch); trace_drv_return_void(local); } static inline int drv_set_antenna(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); if (local->ops->set_antenna) ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant); trace_drv_set_antenna(local, tx_ant, rx_ant, ret); return ret; } static inline int drv_get_antenna(struct ieee80211_local *local, u32 *tx_ant, u32 *rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); if (local->ops->get_antenna) ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant); trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret); return ret; } static inline int drv_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, unsigned int duration, enum ieee80211_roc_type type) { int ret; might_sleep(); trace_drv_remain_on_channel(local, sdata, chan, duration, type); ret = local->ops->remain_on_channel(&local->hw, &sdata->vif, chan, duration, type); trace_drv_return_int(local, ret); return ret; } static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); trace_drv_cancel_remain_on_channel(local, sdata); ret = local->ops->cancel_remain_on_channel(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_ringparam(struct ieee80211_local *local, u32 tx, u32 rx) { int ret = -ENOTSUPP; might_sleep(); trace_drv_set_ringparam(local, tx, rx); if (local->ops->set_ringparam) ret = local->ops->set_ringparam(&local->hw, tx, rx); trace_drv_return_int(local, ret); return ret; } static inline void drv_get_ringparam(struct ieee80211_local *local, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max) { might_sleep(); trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max); if (local->ops->get_ringparam) local->ops->get_ringparam(&local->hw, tx, tx_max, rx, rx_max); trace_drv_return_void(local); } static inline bool drv_tx_frames_pending(struct ieee80211_local *local) { bool ret = false; might_sleep(); trace_drv_tx_frames_pending(local); if (local->ops->tx_frames_pending) ret = local->ops->tx_frames_pending(&local->hw); trace_drv_return_bool(local, ret); return ret; } static inline int drv_set_bitrate_mask(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_bitrate_mask *mask) { int ret = -EOPNOTSUPP; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_set_bitrate_mask(local, sdata, mask); if (local->ops->set_bitrate_mask) ret = local->ops->set_bitrate_mask(&local->hw, &sdata->vif, mask); trace_drv_return_int(local, ret); return ret; } static inline void drv_set_rekey_data(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_gtk_rekey_data *data) { if (!check_sdata_in_driver(sdata)) return; trace_drv_set_rekey_data(local, sdata, data); if (local->ops->set_rekey_data) local->ops->set_rekey_data(&local->hw, &sdata->vif, data); trace_drv_return_void(local); } static inline void drv_event_callback(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct ieee80211_event *event) { trace_drv_event_callback(local, sdata, event); if (local->ops->event_callback) local->ops->event_callback(&local->hw, &sdata->vif, event); trace_drv_return_void(local); } static inline void drv_release_buffered_frames(struct ieee80211_local *local, struct sta_info *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data) { trace_drv_release_buffered_frames(local, &sta->sta, tids, num_frames, reason, more_data); if (local->ops->release_buffered_frames) local->ops->release_buffered_frames(&local->hw, &sta->sta, tids, num_frames, reason, more_data); trace_drv_return_void(local); } static inline void drv_allow_buffered_frames(struct ieee80211_local *local, struct sta_info *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data) { trace_drv_allow_buffered_frames(local, &sta->sta, tids, num_frames, reason, more_data); if (local->ops->allow_buffered_frames) local->ops->allow_buffered_frames(&local->hw, &sta->sta, tids, num_frames, reason, more_data); trace_drv_return_void(local); } static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info *info) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); trace_drv_mgd_prepare_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_prepare_tx) local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } static inline void drv_mgd_complete_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info *info) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); trace_drv_mgd_complete_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_complete_tx) local->ops->mgd_complete_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } static inline void drv_mgd_protect_tdls_discover(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); trace_drv_mgd_protect_tdls_discover(local, sdata); if (local->ops->mgd_protect_tdls_discover) local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_add_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { int ret = -EOPNOTSUPP; might_sleep(); trace_drv_add_chanctx(local, ctx); if (local->ops->add_chanctx) ret = local->ops->add_chanctx(&local->hw, &ctx->conf); trace_drv_return_int(local, ret); if (!ret) ctx->driver_present = true; return ret; } static inline void drv_remove_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { might_sleep(); if (WARN_ON(!ctx->driver_present)) return; trace_drv_remove_chanctx(local, ctx); if (local->ops->remove_chanctx) local->ops->remove_chanctx(&local->hw, &ctx->conf); trace_drv_return_void(local); ctx->driver_present = false; } static inline void drv_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, u32 changed) { might_sleep(); trace_drv_change_chanctx(local, ctx, changed); if (local->ops->change_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->change_chanctx(&local->hw, &ctx->conf, changed); } trace_drv_return_void(local); } static inline int drv_assign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *ctx) { int ret = 0; if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_assign_vif_chanctx(local, sdata, ctx); if (local->ops->assign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); ret = local->ops->assign_vif_chanctx(&local->hw, &sdata->vif, &ctx->conf); } trace_drv_return_int(local, ret); return ret; } static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *ctx) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_unassign_vif_chanctx(local, sdata, ctx); if (local->ops->unassign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->unassign_vif_chanctx(&local->hw, &sdata->vif, &ctx->conf); } trace_drv_return_void(local); } int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); static inline int drv_start_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret = 0; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_start_ap(local, sdata, &sdata->vif.bss_conf); if (local->ops->start_ap) ret = local->ops->start_ap(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_stop_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { if (!check_sdata_in_driver(sdata)) return; trace_drv_stop_ap(local, sdata); if (local->ops->stop_ap) local->ops->stop_ap(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline void drv_reconfig_complete(struct ieee80211_local *local, enum ieee80211_reconfig_type reconfig_type) { might_sleep(); trace_drv_reconfig_complete(local, reconfig_type); if (local->ops->reconfig_complete) local->ops->reconfig_complete(&local->hw, reconfig_type); trace_drv_return_void(local); } static inline void drv_set_default_unicast_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int key_idx) { if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(key_idx < -1 || key_idx > 3); trace_drv_set_default_unicast_key(local, sdata, key_idx); if (local->ops->set_default_unicast_key) local->ops->set_default_unicast_key(&local->hw, &sdata->vif, key_idx); trace_drv_return_void(local); } #if IS_ENABLED(CONFIG_IPV6) static inline void drv_ipv6_addr_change(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct inet6_dev *idev) { trace_drv_ipv6_addr_change(local, sdata); if (local->ops->ipv6_addr_change) local->ops->ipv6_addr_change(&local->hw, &sdata->vif, idev); trace_drv_return_void(local); } #endif static inline void drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = sdata->local; if (local->ops->channel_switch_beacon) { trace_drv_channel_switch_beacon(local, sdata, chandef); local->ops->channel_switch_beacon(&local->hw, &sdata->vif, chandef); } } static inline int drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { struct ieee80211_local *local = sdata->local; int ret = 0; if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_pre_channel_switch(local, sdata, ch_switch); if (local->ops->pre_channel_switch) ret = local->ops->pre_channel_switch(&local->hw, &sdata->vif, ch_switch); trace_drv_return_int(local, ret); return ret; } static inline int drv_post_channel_switch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; int ret = 0; if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_post_channel_switch(local, sdata); if (local->ops->post_channel_switch) ret = local->ops->post_channel_switch(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_abort_channel_switch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; if (!check_sdata_in_driver(sdata)) return; trace_drv_abort_channel_switch(local, sdata); if (local->ops->abort_channel_switch) local->ops->abort_channel_switch(&local->hw, &sdata->vif); } static inline void drv_channel_switch_rx_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { struct ieee80211_local *local = sdata->local; if (!check_sdata_in_driver(sdata)) return; trace_drv_channel_switch_rx_beacon(local, sdata, ch_switch); if (local->ops->channel_switch_rx_beacon) local->ops->channel_switch_rx_beacon(&local->hw, &sdata->vif, ch_switch); } static inline int drv_join_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret = 0; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_join_ibss(local, sdata, &sdata->vif.bss_conf); if (local->ops->join_ibss) ret = local->ops->join_ibss(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_leave_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_leave_ibss(local, sdata); if (local->ops->leave_ibss) local->ops->leave_ibss(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, struct sta_info *sta) { u32 ret = 0; trace_drv_get_expected_throughput(&sta->sta); if (local->ops->get_expected_throughput && sta->uploaded) ret = local->ops->get_expected_throughput(&local->hw, &sta->sta); trace_drv_return_u32(local, ret); return ret; } static inline int drv_get_txpower(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int *dbm) { int ret; if (!local->ops->get_txpower) return -EOPNOTSUPP; ret = local->ops->get_txpower(&local->hw, &sdata->vif, dbm); trace_drv_get_txpower(local, sdata, *dbm, ret); return ret; } static inline int drv_tdls_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 oper_class, struct cfg80211_chan_def *chandef, struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; if (!local->ops->tdls_channel_switch) return -EOPNOTSUPP; trace_drv_tdls_channel_switch(local, sdata, sta, oper_class, chandef); ret = local->ops->tdls_channel_switch(&local->hw, &sdata->vif, sta, oper_class, chandef, tmpl_skb, ch_sw_tm_ie); trace_drv_return_int(local, ret); return ret; } static inline void drv_tdls_cancel_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; if (!local->ops->tdls_cancel_channel_switch) return; trace_drv_tdls_cancel_channel_switch(local, sdata, sta); local->ops->tdls_cancel_channel_switch(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } static inline void drv_tdls_recv_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_tdls_ch_sw_params *params) { trace_drv_tdls_recv_channel_switch(local, sdata, params); if (local->ops->tdls_recv_channel_switch) local->ops->tdls_recv_channel_switch(&local->hw, &sdata->vif, params); trace_drv_return_void(local); } static inline void drv_wake_tx_queue(struct ieee80211_local *local, struct txq_info *txq) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif); /* In reconfig don't transmit now, but mark for waking later */ if (local->in_reconfig) { set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txq->flags); return; } if (!check_sdata_in_driver(sdata)) return; trace_drv_wake_tx_queue(local, sdata, txq); local->ops->wake_tx_queue(&local->hw, &txq->txq); } static inline void schedule_and_wake_txq(struct ieee80211_local *local, struct txq_info *txqi) { ieee80211_schedule_txq(&local->hw, &txqi->txq); drv_wake_tx_queue(local, txqi); } static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local, struct sk_buff *head, struct sk_buff *skb) { if (!local->ops->can_aggregate_in_amsdu) return true; return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb); } static inline int drv_get_ftm_responder_stats(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_ftm_responder_stats *ftm_stats) { u32 ret = -EOPNOTSUPP; if (local->ops->get_ftm_responder_stats) ret = local->ops->get_ftm_responder_stats(&local->hw, &sdata->vif, ftm_stats); trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats); return ret; } static inline int drv_start_pmsr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_start_pmsr(local, sdata); if (local->ops->start_pmsr) ret = local->ops->start_pmsr(&local->hw, &sdata->vif, request); trace_drv_return_int(local, ret); return ret; } static inline void drv_abort_pmsr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_pmsr_request *request) { trace_drv_abort_pmsr(local, sdata); might_sleep(); if (!check_sdata_in_driver(sdata)) return; if (local->ops->abort_pmsr) local->ops->abort_pmsr(&local->hw, &sdata->vif, request); trace_drv_return_void(local); } static inline int drv_start_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf) { int ret; might_sleep(); check_sdata_in_driver(sdata); trace_drv_start_nan(local, sdata, conf); ret = local->ops->start_nan(&local->hw, &sdata->vif, conf); trace_drv_return_int(local, ret); return ret; } static inline void drv_stop_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); check_sdata_in_driver(sdata); trace_drv_stop_nan(local, sdata); local->ops->stop_nan(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_nan_change_conf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf, u32 changes) { int ret; might_sleep(); check_sdata_in_driver(sdata); if (!local->ops->nan_change_conf) return -EOPNOTSUPP; trace_drv_nan_change_conf(local, sdata, conf, changes); ret = local->ops->nan_change_conf(&local->hw, &sdata->vif, conf, changes); trace_drv_return_int(local, ret); return ret; } static inline int drv_add_nan_func(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_nan_func *nan_func) { int ret; might_sleep(); check_sdata_in_driver(sdata); if (!local->ops->add_nan_func) return -EOPNOTSUPP; trace_drv_add_nan_func(local, sdata, nan_func); ret = local->ops->add_nan_func(&local->hw, &sdata->vif, nan_func); trace_drv_return_int(local, ret); return ret; } static inline void drv_del_nan_func(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u8 instance_id) { might_sleep(); check_sdata_in_driver(sdata); trace_drv_del_nan_func(local, sdata, instance_id); if (local->ops->del_nan_func) local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id); trace_drv_return_void(local); } static inline int drv_set_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct cfg80211_tid_config *tid_conf) { int ret; might_sleep(); ret = local->ops->set_tid_config(&local->hw, &sdata->vif, sta, tid_conf); trace_drv_return_int(local, ret); return ret; } static inline int drv_reset_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 tids) { int ret; might_sleep(); ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids); trace_drv_return_int(local, ret); return ret; } static inline void drv_update_vif_offload(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); check_sdata_in_driver(sdata); if (!local->ops->update_vif_offload) return; trace_drv_update_vif_offload(local, sdata); local->ops->update_vif_offload(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline void drv_sta_set_4addr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_set_4addr(local, sdata, sta, enabled); if (local->ops->sta_set_4addr) local->ops->sta_set_4addr(&local->hw, &sdata->vif, sta, enabled); trace_drv_return_void(local); } static inline void drv_sta_set_decap_offload(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_set_decap_offload(local, sdata, sta, enabled); if (local->ops->sta_set_decap_offload) local->ops->sta_set_decap_offload(&local->hw, &sdata->vif, sta, enabled); trace_drv_return_void(local); } static inline void drv_add_twt_setup(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_twt_setup *twt) { struct ieee80211_twt_params *twt_agrt; might_sleep(); if (!check_sdata_in_driver(sdata)) return; twt_agrt = (void *)twt->params; trace_drv_add_twt_setup(local, sta, twt, twt_agrt); local->ops->add_twt_setup(&local->hw, sta, twt); trace_drv_return_void(local); } static inline void drv_twt_teardown_request(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 flowid) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; if (!local->ops->twt_teardown_request) return; trace_drv_twt_teardown_request(local, sta, flowid); local->ops->twt_teardown_request(&local->hw, sta, flowid); trace_drv_return_void(local); } #endif /* __MAC80211_DRIVER_OPS */ |
4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> * * Fix by Harrison Xing <harrison@mountainviewdata.com>. * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>. * Extended attributes for symlinks and special files added per * suggestion of Luka Renko <luka.renko@hermes.si>. * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, * Red Hat Inc. * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz * and Andreas Gruenbacher <agruen@suse.de>. */ /* * Extended attributes are stored directly in inodes (on file systems with * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl * field contains the block number if an inode uses an additional block. All * attributes must fit in the inode and one additional block. Blocks that * contain the identical set of attributes may be shared among several inodes. * Identical blocks are detected by keeping a cache of blocks that have * recently been accessed. * * The attributes in inodes and on blocks have a different header; the entries * are stored in the same format: * * +------------------+ * | header | * | entry 1 | | * | entry 2 | | growing downwards * | entry 3 | v * | four null bytes | * | . . . | * | value 1 | ^ * | value 3 | | growing upwards * | value 2 | | * +------------------+ * * The header is followed by multiple entry descriptors. In disk blocks, the * entry descriptors are kept sorted. In inodes, they are unsorted. The * attribute values are aligned to the end of the block in no specific order. * * Locking strategy * ---------------- * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. * EA blocks are only changed if they are exclusive to an inode, so * holding xattr_sem also means that nothing but the EA block's reference * count can change. Multiple writers to the same block are synchronized * by the buffer lock. */ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, fmt, ...) \ printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) \ printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) #else # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_block_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head * ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count); static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif &ext4_xattr_hurd_handler, NULL }; #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_block_cache) #define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_inode_cache) static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { struct ext4_inode_info *ei = EXT4_I(ea_inode); lockdep_set_subclass(&ea_inode->i_rwsem, 1); (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA); } #endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); } static int ext4_xattr_block_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; if (ext4_has_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); unlock_buffer(bh); } return ret; } static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { if (ext4_has_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; return handler; } static int ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, void *value_start) { struct ext4_xattr_entry *e = entry; /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) return -EFSCORRUPTED; if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) return -EFSCORRUPTED; e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); if (size > EXT4_XATTR_SIZE_MAX) return -EFSCORRUPTED; if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); void *value; /* * The value cannot overlap the names, and the value * with padding cannot extend beyond 'end'. Check both * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. */ if (offs > end - value_start) return -EFSCORRUPTED; value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || EXT4_XATTR_SIZE(size) > end - value) return -EFSCORRUPTED; } entry = EXT4_XATTR_NEXT(entry); } return 0; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { int error = -EFSCORRUPTED; if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) goto errout; if (buffer_verified(bh)) return 0; error = -EFSBADCRC; if (!ext4_xattr_block_csum_verify(inode, bh)) goto errout; error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, bh->b_data); errout: if (error) __ext4_error_inode(inode, function, line, 0, -error, "corrupted xattr block %llu", (unsigned long long) bh->b_blocknr); else set_buffer_verified(bh); return error; } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { int error = -EFSCORRUPTED; if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, -error, "corrupted in-inode xattr"); return error; } #define xattr_check_inode(inode, header, end) \ __xattr_check_inode((inode), (header), (end), __func__, __LINE__) static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) { struct ext4_xattr_entry *entry, *next; size_t name_len; int cmp = 1; if (name == NULL) return -EINVAL; name_len = strlen(name); for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { next = EXT4_XATTR_NEXT(entry); if ((void *) next >= end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); return -EFSCORRUPTED; } cmp = name_index - entry->e_name_index; if (!cmp) cmp = name_len - entry->e_name_len; if (!cmp) cmp = memcmp(name, entry->e_name, name_len); if (cmp <= 0 && (sorted || cmp == 0)) break; } *pentry = entry; return cmp ? -ENODATA : 0; } static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { return ((u64)ea_inode->i_ctime.tv_sec << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { return (u32)ea_inode->i_atime.tv_sec; } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { ea_inode->i_atime.tv_sec = hash; } /* * Read the EA value from an inode. */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { int blocksize = 1 << ea_inode->i_blkbits; int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; int tail_size = (size % blocksize) ?: blocksize; struct buffer_head *bhs_inline[8]; struct buffer_head **bhs = bhs_inline; int i, ret; if (bh_count > ARRAY_SIZE(bhs_inline)) { bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); if (!bhs) return -ENOMEM; } ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, true /* wait */, bhs); if (ret) goto free_bhs; for (i = 0; i < bh_count; i++) { /* There shouldn't be any holes in ea_inode. */ if (!bhs[i]) { ret = -EFSCORRUPTED; goto put_bhs; } memcpy((char *)buf + blocksize * i, bhs[i]->b_data, i < bh_count - 1 ? blocksize : tail_size); } ret = 0; put_bhs: for (i = 0; i < bh_count; i++) brelse(bhs[i]); free_bhs: if (bhs != bhs_inline) kfree(bhs); return ret; } #define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; /* * We have to check for this corruption early as otherwise * iget_locked() could wait indefinitely for the state of our * parent inode. */ if (parent->i_ino == ea_ino) { ext4_error(parent->i_sb, "Parent and EA inode have the same ino %lu", ea_ino); return -EFSCORRUPTED; } inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, "error while reading EA inode %lu err=%d", ea_ino, err); return err; } ext4_xattr_inode_set_class(inode); /* * Check whether this is an old Lustre-style xattr inode. Lustre * implementation does not have hash validation, rather it has a * backpointer from ea_inode to the parent inode. */ if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && inode->i_generation == parent->i_generation) { ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); ext4_xattr_inode_set_ref(inode, 1); } else { inode_lock_nested(inode, I_MUTEX_XATTR); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); } *ea_inode = inode; return 0; } /* Remove entry from mbcache when EA inode is getting evicted */ void ext4_evict_ea_inode(struct inode *inode) { struct mb_cache_entry *oe; if (!EA_INODE_CACHE(inode)) return; /* Wait for entry to get unused so that we can remove it */ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), ext4_xattr_inode_get_hash(inode), inode->i_ino))) { mb_cache_entry_wait_unused(oe); mb_cache_entry_put(EA_INODE_CACHE(inode), oe); } } static int ext4_xattr_inode_verify_hashes(struct inode *ea_inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { u32 hash; /* Verify stored hash matches calculated hash. */ hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); if (hash != ext4_xattr_inode_get_hash(ea_inode)) return -EFSCORRUPTED; if (entry) { __le32 e_hash, tmp_data; /* Verify entry hash. */ tmp_data = cpu_to_le32(hash); e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, &tmp_data, 1); if (e_hash != entry->e_hash) return -EFSCORRUPTED; } return 0; } /* * Read xattr value from the EA inode. */ static int ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); struct inode *ea_inode; int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; } if (i_size_read(ea_inode) != size) { ext4_warning_inode(ea_inode, "ea_inode file size=%llu entry size=%zu", i_size_read(ea_inode), size); err = -EFSCORRUPTED; goto out; } err = ext4_xattr_inode_read(ea_inode, buffer, size); if (err) goto out; if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } if (ea_inode_cache) mb_cache_entry_create(ea_inode_cache, GFP_NOFS, ext4_xattr_inode_get_hash(ea_inode), ea_inode->i_ino, true /* reusable */); } out: iput(ea_inode); return err; } static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct buffer_head *bh = NULL; struct ext4_xattr_entry *entry; size_t size; void *end; int error; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); end = bh->b_data + bh->b_size; error = xattr_find_entry(inode, &entry, end, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = bh->b_data + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(bh); return error; } int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; struct ext4_iloc iloc; size_t size; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = (void *)IFIRST(header) + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(iloc.bh); return error; } /* * ext4_xattr_get() * * Copy an extended attribute into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. */ int ext4_xattr_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { int error; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (strlen(name) > 255) return -ERANGE; down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); if (error == -ENODATA) error = ext4_xattr_block_get(inode, name_index, name, buffer, buffer_size); up_read(&EXT4_I(inode)->xattr_sem); return error; } static int ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); if (handler && (!handler->list || handler->list(dentry))) { const char *prefix = handler->prefix ?: handler->name; size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; if (buffer) { if (size > rest) return -ERANGE; memcpy(buffer, prefix, prefix_len); buffer += prefix_len; memcpy(buffer, entry->e_name, entry->e_name_len); buffer += entry->e_name_len; *buffer++ = 0; } rest -= size; } } return buffer_size - rest; /* total size */ } static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); return error; } static int ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: brelse(iloc.bh); return error; } /* * Inode operation listxattr() * * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. */ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; if (buffer) { buffer += ret; buffer_size -= ret; } ret = ext4_xattr_block_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; ret += ret2; errout: up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } /* * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is * not set, set it. */ static void ext4_xattr_update_super_block(handle_t *handle, struct super_block *sb) { if (ext4_has_feature_xattr(sb)) return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) { struct ext4_iloc iloc = { .bh = NULL }; struct buffer_head *bh = NULL; struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; void *end; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; ret = xattr_check_inode(inode, header, end); if (ret) goto out; for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); bh = NULL; goto out; } ret = ext4_xattr_check_block(inode, bh); if (ret) goto out; for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } *usage = ea_inode_refs + 1; ret = 0; out: brelse(iloc.bh); brelse(bh); return ret; } static inline size_t round_up_cluster(struct inode *inode, size_t length) { struct super_block *sb = inode->i_sb; size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + inode->i_blkbits); size_t mask = ~(cluster_size - 1); return (length + cluster_size - 1) & mask; } static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) { int err; err = dquot_alloc_inode(inode); if (err) return err; err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); if (err) dquot_free_inode(inode); return err; } static void ext4_xattr_inode_free_quota(struct inode *parent, struct inode *ea_inode, size_t len) { if (ea_inode && ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) return; dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, struct buffer_head *block_bh, size_t value_len, bool is_create) { int credits; int blocks; /* * 1) Owner inode update * 2) Ref count update on old xattr block * 3) new xattr block * 4) block bitmap update for new xattr block * 5) group descriptor for new xattr block * 6) block bitmap update for old xattr block * 7) group descriptor for old block * * 6 & 7 can happen if we have two racing threads T_a and T_b * which are each trying to set an xattr on inodes I_a and I_b * which were both initially sharing an xattr block. */ credits = 7; /* Quota updates. */ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); /* * In case of inline data, we may push out the data to a block, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) credits += ext4_writepage_trans_blocks(inode) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) return credits; /* New ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks. */ blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; /* Blocks themselves. */ credits += blocks; if (!is_create) { /* Dereference ea_inode holding old xattr value. * Old ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks for old ea_inode. */ blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree for old * ea_inode. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; } /* We may need to clone the existing xattr block in which case we need * to increment ref counts for existing ea_inodes referenced by it. */ if (block_bh) { struct ext4_xattr_entry *entry = BFIRST(block_bh); for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) /* Ref count update on ea_inode. */ credits += 1; } return credits; } static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { struct ext4_iloc iloc; s64 ref_count; int ret; inode_lock_nested(ea_inode, I_MUTEX_XATTR); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); if (ret) goto out; ref_count = ext4_xattr_inode_get_ref(ea_inode); ref_count += ref_change; ext4_xattr_inode_set_ref(ea_inode, ref_count); if (ref_change > 0) { WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 1) { WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); } } else { WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); } } ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); if (ret) ext4_warning_inode(ea_inode, "ext4_mark_iloc_dirty() failed ret=%d", ret); out: inode_unlock(ea_inode); return ret; } static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, 1); } static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, -1); } static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, struct ext4_xattr_entry *first) { struct inode *ea_inode; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *failed_entry; unsigned int ea_ino; int err, saved_err; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) goto cleanup; err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "inc ref error %d", err); iput(ea_inode); goto cleanup; } iput(ea_inode); } return 0; cleanup: saved_err = err; failed_entry = entry; for (entry = first; entry != failed_entry; entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, err); continue; } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); } return saved_err; } static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, bool block_csum, bool dirty) { int error; if (bh && dirty) { if (block_csum) ext4_xattr_block_csum_set(inode, bh); error = ext4_handle_dirty_metadata(handle, NULL, bh); if (error) { ext4_warning(inode->i_sb, "Handle metadata (error %d)", error); return error; } } return 0; } static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, struct ext4_xattr_entry *first, bool block_csum, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits, bool skip_quota) { struct inode *ea_inode; struct ext4_xattr_entry *entry; bool dirty = false; unsigned int ea_ino; int err; int credits; /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) continue; err = ext4_expand_inode_array(ea_inode_array, ea_inode); if (err) { ext4_warning_inode(ea_inode, "Expand inode array err=%d", err); iput(ea_inode); continue; } err = ext4_journal_ensure_credits_fn(handle, credits, credits, ext4_free_metadata_revoke_credits(parent->i_sb, 1), ext4_xattr_restart_fn(handle, parent, bh, block_csum, dirty)); if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } if (err > 0) { err = ext4_journal_get_write_access(handle, parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", err); continue; } } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", err); continue; } if (!skip_quota) ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* * Forget about ea_inode within the same transaction that * decrements the ref count. This avoids duplicate decrements in * case the rest of the work spills over to subsequent * transactions. */ entry->e_value_inum = 0; entry->e_value_size = 0; dirty = true; } if (dirty) { /* * Note that we are deliberately skipping csum calculation for * the final update because we do not expect any journal * restarts until xattr block is freed. */ err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) ext4_warning_inode(parent, "handle dirty metadata err=%d", err); } } /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (error) goto out; retry_ref: lock_buffer(bh); hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount); if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bh->b_blocknr); if (oe) { unlock_buffer(bh); mb_cache_entry_wait_unused(oe); mb_cache_entry_put(ea_block_cache, oe); goto retry_ref; } } get_bh(bh); unlock_buffer(bh); if (ext4_has_feature_ea_inode(inode->i_sb)) ext4_xattr_inode_dec_ref_all(handle, inode, bh, BFIRST(bh), true /* block_csum */, ea_inode_array, extra_credits, true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { ref--; BHDR(bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; if (ea_block_cache) { ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } } ext4_xattr_block_csum_set(inode, bh); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect * from a race where someone else frees the block (and releases * its journal_head) before we are done dirtying the buffer. In * nojournal mode this race is harmless and we actually cannot * call ext4_handle_dirty_metadata() with locked buffer as * that function can call sync_dirty_buffer() so for that case * we handle the dirtying after unlocking the buffer. */ if (ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); unlock_buffer(bh); if (!ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); } out: ext4_std_error(inode->i_sb, error); return; } /* * Find the available free space for EAs. This also returns the total number of * bytes used by EA entries. */ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } if (total) *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } /* * Write the value of the EA in an inode. */ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, const void *buf, int bufsize) { struct buffer_head *bh = NULL; unsigned long block = 0; int blocksize = ea_inode->i_sb->s_blocksize; int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; int ret = 0, ret2 = 0; int retries = 0; retry: while (ret >= 0 && ret < max_blocks) { struct ext4_map_blocks map; map.m_lblk = block += ret; map.m_len = max_blocks -= ret; ret = ext4_map_blocks(handle, ea_inode, &map, EXT4_GET_BLOCKS_CREATE); if (ret <= 0) { ext4_mark_inode_dirty(handle, ea_inode); if (ret == -ENOSPC && ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { ret = 0; goto retry; } break; } } if (ret < 0) return ret; block = 0; while (wsize < bufsize) { brelse(bh); csize = (bufsize - wsize) > blocksize ? blocksize : bufsize - wsize; bh = ext4_getblk(handle, ea_inode, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) { WARN_ON_ONCE(1); EXT4_ERROR_INODE(ea_inode, "ext4_getblk() return bh = NULL"); return -EFSCORRUPTED; } ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, EXT4_JTR_NONE); if (ret) goto out; memcpy(bh->b_data, buf, csize); /* * Zero out block tail to avoid writing uninitialized memory * to disk. */ if (csize < blocksize) memset(bh->b_data + csize, 0, blocksize - csize); set_buffer_uptodate(bh); ext4_handle_dirty_metadata(handle, ea_inode, bh); buf += csize; wsize += csize; block += 1; } inode_lock(ea_inode); i_size_write(ea_inode, wsize); ext4_update_i_disksize(ea_inode, wsize); inode_unlock(ea_inode); ret2 = ext4_mark_inode_dirty(handle, ea_inode); if (unlikely(ret2 && !ret)) ret = ret2; out: brelse(bh); return ret; } /* * Create an inode to store the value of a large EA. */ static struct inode *ext4_xattr_inode_create(handle_t *handle, struct inode *inode, u32 hash) { struct inode *ea_inode = NULL; uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; if (inode->i_sb->s_root == NULL) { ext4_warning(inode->i_sb, "refuse to create EA inode when umounting"); WARN_ON(1); return ERR_PTR(-EINVAL); } /* * Let the next inode be the goal, so we try and allocate the EA inode * in the same group, or nearby one. */ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG | 0600, NULL, inode->i_ino + 1, owner, EXT4_EA_INODE_FL); if (!IS_ERR(ea_inode)) { ea_inode->i_op = &ext4_file_inode_operations; ea_inode->i_fop = &ext4_file_operations; ext4_set_aops(ea_inode); ext4_xattr_inode_set_class(ea_inode); unlock_new_inode(ea_inode); ext4_xattr_inode_set_ref(ea_inode, 1); ext4_xattr_inode_set_hash(ea_inode, hash); err = ext4_mark_inode_dirty(handle, ea_inode); if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } /* * Xattr inodes are shared therefore quota charging is performed * at a higher level. */ dquot_free_inode(ea_inode); dquot_drop(ea_inode); inode_lock(ea_inode); ea_inode->i_flags |= S_NOQUOTA; inode_unlock(ea_inode); } return ea_inode; } static struct inode * ext4_xattr_inode_cache_find(struct inode *inode, const void *value, size_t value_len, u32 hash) { struct inode *ea_inode; struct mb_cache_entry *ce; struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); void *ea_data; if (!ea_inode_cache) return NULL; ce = mb_cache_entry_find_first(ea_inode_cache, hash); if (!ce) return NULL; WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && !(current->flags & PF_MEMALLOC_NOFS)); ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; } while (ce) { ea_inode = ext4_iget(inode->i_sb, ce->e_value, EXT4_IGET_EA_INODE); if (IS_ERR(ea_inode)) goto next_entry; ext4_xattr_inode_set_class(ea_inode); if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, value_len) && !memcmp(value, ea_data, value_len)) { mb_cache_entry_touch(ea_inode_cache, ce); mb_cache_entry_put(ea_inode_cache, ce); kvfree(ea_data); return ea_inode; } iput(ea_inode); next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } kvfree(ea_data); return NULL; } /* * Add value of the EA in an inode. */ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, const void *value, size_t value_len, struct inode **ret_inode) { struct inode *ea_inode; u32 hash; int err; hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { iput(ea_inode); return err; } *ret_inode = ea_inode; return 0; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); if (IS_ERR(ea_inode)) return PTR_ERR(ea_inode); err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { ext4_xattr_inode_dec_ref(handle, ea_inode); iput(ea_inode); return err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); *ret_inode = ea_inode; return 0; } /* * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode * feature is enabled. */ #define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, bool is_block) { struct ext4_xattr_entry *last, *next; struct ext4_xattr_entry *here = s->here; size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; struct inode *new_ea_inode = NULL; size_t old_size, new_size; int ret; /* Space used by old and new values. */ old_size = (!s->not_found && !here->e_value_inum) ? EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; /* * Optimization for the simple case when old and new values have the * same padded sizes. Not applicable if external inodes are involved. */ if (new_size && new_size == old_size) { size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; here->e_value_size = cpu_to_le32(i->value_len); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } goto update_hash; } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = next) { next = EXT4_XATTR_NEXT(last); if ((void *)next >= s->end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); ret = -EFSCORRUPTED; goto out; } if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } /* Check whether we have enough space. */ if (i->value) { size_t free; free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) free += EXT4_XATTR_LEN(name_len) + old_size; if (free < EXT4_XATTR_LEN(name_len) + new_size) { ret = -ENOSPC; goto out; } /* * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && (min_offs + old_size - new_size) < EXT4_XATTR_BLOCK_RESERVE(inode)) { ret = -ENOSPC; goto out; } } /* * Getting access to old and new ea inodes is subject to failures. * Finish that work before doing any modifications to the xattr data. */ if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; goto out; } } if (i->value && in_inode) { WARN_ON_ONCE(!i->value_len); ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); if (ret) goto out; ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len, &new_ea_inode); if (ret) { new_ea_inode = NULL; ext4_xattr_inode_free_quota(inode, NULL, i->value_len); goto out; } } if (old_ea_inode) { /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); if (ret) { /* Release newly required ref count on new_ea_inode. */ if (new_ea_inode) { int err; err = ext4_xattr_inode_dec_ref(handle, new_ea_inode); if (err) ext4_warning_inode(new_ea_inode, "dec ref new_ea_inode err=%d", err); ext4_xattr_inode_free_quota(inode, new_ea_inode, i->value_len); } goto out; } ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } /* No failures allowed past this point. */ if (!s->not_found && here->e_value_size && !here->e_value_inum) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; memmove(first_val + old_size, first_val, val - first_val); memset(first_val, 0, old_size); min_offs += old_size; /* Adjust all value offsets. */ last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); if (!last->e_value_inum && last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + old_size); last = EXT4_XATTR_NEXT(last); } } if (!i->value) { /* Remove old name. */ size_t size = EXT4_XATTR_LEN(name_len); last = ENTRY((void *)last - size); memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); /* * Update i_inline_off - moved ibody region might contain * system.data attribute. Handling a failure here won't * cause other complications for setting an xattr. */ if (!is_block && ext4_has_inline_data(inode)) { ret = ext4_find_inline_data_nolock(inode); if (ret) { ext4_warning_inode(inode, "unable to update i_inline_off"); goto out; } } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); size_t rest = (void *)last - (void *)here + sizeof(__u32); memmove((void *)here + size, here, rest); memset(here, 0, size); here->e_name_index = i->name_index; here->e_name_len = name_len; memcpy(here->e_name, i->name, name_len); } else { /* This is an update, reset value info. */ here->e_value_inum = 0; here->e_value_offs = 0; here->e_value_size = 0; } if (i->value) { /* Insert new value. */ if (in_inode) { here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); } else if (i->value_len) { void *val = s->base + min_offs - new_size; here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } } here->e_value_size = cpu_to_le32(i->value_len); } update_hash: if (i->value) { __le32 hash = 0; /* Entry hash calculation. */ if (in_inode) { __le32 crc32c_hash; /* * Feed crc32c hash instead of the raw value for entry * hash calculation. This is to avoid walking * potentially long value buffer again. */ crc32c_hash = cpu_to_le32( ext4_xattr_inode_get_hash(new_ea_inode)); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { __le32 *value = s->base + le16_to_cpu( here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, new_size >> 2); } here->e_hash = hash; } if (is_block) ext4_xattr_rehash((struct ext4_xattr_header *)s->base); ret = 0; out: iput(old_ea_inode); iput(new_ea_inode); return ret; } struct ext4_xattr_block_find { struct ext4_xattr_search s; struct buffer_head *bh; }; static int ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; int error; ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", i->name_index, i->name, i->value, (long)i->value_len); if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bs->bh)) { error = PTR_ERR(bs->bh); bs->bh = NULL; return error; } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) return error; /* Find the named attribute. */ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) return error; bs->s.not_found = error; } return 0; } static int ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search s_copy = bs->s; struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); struct inode *ea_inode = NULL, *tmp_inode; size_t old_ea_inode_quota = 0; unsigned int ea_ino; #define header(x) ((struct ext4_xattr_header *)(x)) if (s->base) { int offset = (char *)s->here - bs->bh->b_data; BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, sb, bs->bh, EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect modified * block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bs->bh->b_blocknr); if (oe) { /* * Xattr block is getting reused. Leave * it alone. */ mb_cache_entry_put(ea_block_cache, oe); goto clone_block; } } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) goto bad_block; if (!error) error = ext4_handle_dirty_metadata(handle, inode, bs->bh); if (error) goto cleanup; goto inserted; } clone_block: unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; /* * If existing entry points to an xattr inode, we need * to prevent ext4_xattr_set_entry() from decrementing * ref count on it because the reference belongs to the * original block. In this case, make the entry look * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &tmp_inode); if (error) goto cleanup; if (!ext4_test_inode_state(tmp_inode, EXT4_STATE_LUSTRE_EA_INODE)) { /* * Defer quota free call for previous * inode until success is guaranteed. */ old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); } iput(tmp_inode); s->here->e_value_inum = 0; s->here->e_value_size = 0; } } else { /* Allocate a buffer where we construct the new block. */ s->base = kzalloc(sb->s_blocksize, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); header(s->base)->h_blocks = cpu_to_le32(1); header(s->base)->h_refcount = cpu_to_le32(1); s->first = ENTRY(header(s->base)+1); s->here = ENTRY(header(s->base)+1); s->end = s->base + sb->s_blocksize; } error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; if (i->value && s->here->e_value_inum) { /* * A ref count on ea_inode has been taken as part of the call to * ext4_xattr_set_entry() above. We would like to drop this * extra ref but we have to wait until the xattr block is * initialized and has its own ref count on the ea_inode. */ ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &ea_inode); if (error) { ea_inode = NULL; goto cleanup; } } inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { u32 ref; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access( handle, sb, new_bh, EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); /* * We have to be careful about races with * adding references to xattr block. Once we * hold buffer lock xattr block's state is * stable so we can check the additional * reference fits. */ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; if (ref > EXT4_XATTR_REFCOUNT_MAX) { /* * Undo everything and check mbcache * again. */ unlock_buffer(new_bh); dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); unlock_buffer(new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup_dquot; } mb_cache_entry_touch(ea_block_cache, ce); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { /* We need to allocate a new block */ ext4_fsblk_t goal, block; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %llu", (unsigned long long)block); new_bh = sb_getblk(sb, block); if (unlikely(!new_bh)) { error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); goto cleanup; } error = ext4_xattr_inode_inc_ref_all(handle, inode, ENTRY(header(s->base)+1)); if (error) goto getblk_failed; if (ea_inode) { /* Drop the extra ref on ea_inode. */ error = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error) ext4_warning_inode(ea_inode, "dec ref error=%d", error); iput(ea_inode); ea_inode = NULL; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup; } } if (old_ea_inode_quota) ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ if (bs->bh && bs->bh != new_bh) { struct ext4_xattr_inode_array *ea_inode_array = NULL; ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); ext4_xattr_inode_array_free(ea_inode_array); } error = 0; cleanup: if (ea_inode) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); /* If there was an error, revert the quota charge. */ if (error) ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } if (ce) mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); return error; cleanup_dquot: dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); goto cleanup; #undef header } int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; raw_inode = ext4_raw_inode(&is->iloc); header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = xattr_check_inode(inode, header, is->s.end); if (error) return error; /* Find the named attribute. */ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; } return 0; } int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } return 0; } static int ext4_xattr_value_same(struct ext4_xattr_search *s, struct ext4_xattr_info *i) { void *value; /* When e_value_inum is set the value is stored externally. */ if (s->here->e_value_inum) return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } static struct buffer_head *ext4_xattr_get_block(struct inode *inode) { struct buffer_head *bh; int error; if (!EXT4_I(inode)->i_file_acl) return NULL; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); return ERR_PTR(error); } return bh; } /* * ext4_xattr_set_handle() * * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE * specify that an extended attribute must exist and must not exist * previous to the call, respectively. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { struct ext4_xattr_info i = { .name_index = name_index, .name = name, .value = value, .value_len = value_len, .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; ext4_write_lock_xattr(inode, &no_expand); /* Check journal credits under write lock. */ if (ext4_handle_valid(handle)) { struct buffer_head *bh; int credits; bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, flags & XATTR_CREATE); brelse(bh); if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto cleanup; if (is.s.not_found) error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; if (is.s.not_found && bs.s.not_found) { error = -ENODATA; if (flags & XATTR_REPLACE) goto cleanup; error = 0; if (!value) goto cleanup; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto cleanup; } if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { error = 0; /* Xattr value did not change? Save us some work and bail out */ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) goto cleanup; if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb) && (EXT4_XATTR_SIZE(i.value_len) > EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) i.in_inode = 1; retry_inode: error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { if (EXT4_I(inode)->i_file_acl && !bs.s.base) { brelse(bs.bh); bs.bh = NULL; error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); if (!error && !is.s.not_found) { i.value = NULL; error = ext4_xattr_ibody_set(handle, inode, &i, &is); } else if (error == -ENOSPC) { /* * Xattr does not fit in the block, store at * external inode if possible. */ if (ext4_has_feature_ea_inode(inode->i_sb) && i.value_len && !i.in_inode) { i.in_inode = 1; goto retry_inode; } } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = current_time(inode); if (!value) no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with * error != 0. */ is.iloc.bh = NULL; if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); brelse(bs.bh); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_xattr_set_credits(struct inode *inode, size_t value_len, bool is_create, int *credits) { struct buffer_head *bh; int err; *credits = 0; if (!EXT4_SB(inode->i_sb)->s_journal) return 0; down_read(&EXT4_I(inode)->xattr_sem); bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { err = PTR_ERR(bh); } else { *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, is_create); brelse(bh); err = 0; } up_read(&EXT4_I(inode)->xattr_sem); return err; } /* * ext4_xattr_set() * * Like ext4_xattr_set_handle, but start from an inode. This extended * attribute modification is a filesystem transaction by itself. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; struct super_block *sb = inode->i_sb; int error, retries = 0; int credits; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { int error2; error = ext4_xattr_set_handle(handle, inode, name_index, name, value, value_len, flags); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; } return error; } /* * Shift the EA entries in the inode to create space for the increased * i_extra_isize. */ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; /* We always shift xattr headers further thus offsets get lower */ BUG_ON(value_offs_shift > 0); /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); } } /* Shift the entries by n bytes */ memmove(to, from, n); } /* * Move xattr pointed to by 'entry' from inode into external xattr block */ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, struct ext4_xattr_entry *entry) { struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int needs_kvfree = 0; int error; is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !b_entry_name) { error = -ENOMEM; goto out; } is->s.not_found = -ENODATA; bs->s.not_found = -ENODATA; is->iloc.bh = NULL; bs->bh = NULL; /* Save the entry name and the entry value */ if (entry->e_value_inum) { buffer = kvmalloc(value_size, GFP_NOFS); if (!buffer) { error = -ENOMEM; goto out; } needs_kvfree = 1; error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; } else { size_t value_offs = le16_to_cpu(entry->e_value_offs); buffer = (void *)IFIRST(header) + value_offs; } memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; error = ext4_get_inode_loc(inode, &is->iloc); if (error) goto out; error = ext4_xattr_ibody_find(inode, &i, is); if (error) goto out; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); if (error) goto out; /* Move ea entry from the inode into the block */ error = ext4_xattr_block_set(handle, inode, &i, bs); if (error) goto out; /* Remove the chosen entry from the inode */ i.value = NULL; i.value_len = 0; error = ext4_xattr_ibody_set(handle, inode, &i, is); out: kfree(b_entry_name); if (needs_kvfree && buffer) kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) brelse(bs->bh); kfree(is); kfree(bs); return error; } static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, int isize_diff, size_t ifree, size_t bfree, int *total_ino) { struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); struct ext4_xattr_entry *small_entry; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *last; unsigned int entry_size; /* EA entry size */ unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size; int error; while (isize_diff > ifree) { entry = NULL; small_entry = NULL; min_total_size = ~0U; last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { /* never move system.data out of the inode */ if ((last->e_name_len == 4) && (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && !memcmp(last->e_name, "data", 4)) continue; total_size = EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { small_entry = last; } else { entry = last; min_total_size = total_size; } } } if (entry == NULL) { if (small_entry == NULL) return -ENOSPC; entry = small_entry; } entry_size = EXT4_XATTR_LEN(entry->e_name_len); total_size = entry_size; if (!entry->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) return error; *total_ino -= entry_size; ifree += total_size; bfree -= total_size; } return 0; } /* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. */ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); static unsigned int mnt_count; size_t min_offs; size_t ifree, bfree; int total_ino; void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; header = IHDR(inode, raw_inode); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); error = xattr_check_inode(inode, header, end); if (error) goto cleanup; ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; /* * Enough free space isn't available in the inode, check if * EA block can hold new_extra_isize bytes. */ if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); goto cleanup; } base = BHDR(bh); end = bh->b_data + bh->b_size; min_offs = end - base; bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, NULL); brelse(bh); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } error = -ENOSPC; goto cleanup; } } else { bfree = inode->i_sb->s_blocksize; } error = ext4_xattr_make_inode_space(handle, inode, raw_inode, isize_diff, ifree, bfree, &total_ino); if (error) { if (error == -ENOSPC && !tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } goto cleanup; } shift: /* Adjust the offsets and shift the remaining entries ahead */ ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; if (ext4_has_inline_data(inode)) error = ext4_find_inline_data_nolock(inode); cleanup: if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", inode->i_ino); mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); } return error; } #define EIA_INCR 16 /* must be 2^n */ #define EIA_MASK (EIA_INCR - 1) /* Add the large xattr @inode into @ea_inode_array for deferred iput(). * If @ea_inode_array is new or full it will be grown and the old * contents copied over. */ static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode) { if (*ea_inode_array == NULL) { /* * Start with 15 inodes, so it fits into a power-of-two size. * If *ea_inode_array is NULL, this is essentially offsetof() */ (*ea_inode_array) = kmalloc(offsetof(struct ext4_xattr_inode_array, inodes[EIA_MASK]), GFP_NOFS); if (*ea_inode_array == NULL) return -ENOMEM; (*ea_inode_array)->count = 0; } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { /* expand the array once all 15 + n * 16 slots are full */ struct ext4_xattr_inode_array *new_array = NULL; int count = (*ea_inode_array)->count; /* if new_array is NULL, this is essentially offsetof() */ new_array = kmalloc( offsetof(struct ext4_xattr_inode_array, inodes[count + EIA_INCR]), GFP_NOFS); if (new_array == NULL) return -ENOMEM; memcpy(new_array, *ea_inode_array, offsetof(struct ext4_xattr_inode_array, inodes[count])); kfree(*ea_inode_array); *ea_inode_array = new_array; } (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; return 0; } /* * ext4_xattr_delete_inode() * * Free extended attribute resources associated with this inode. Traverse * all entries and decrement reference on any xattr inodes associated with this * inode. This is called immediately before an inode is freed. We have exclusive * access to the inode. If an orphan inode is deleted it will also release its * references on xattr block and xattr inodes. */ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct buffer_head *bh = NULL; struct ext4_xattr_ibody_header *header; struct ext4_iloc iloc = { .bh = NULL }; struct ext4_xattr_entry *entry; struct inode *ea_inode; int error; error = ext4_journal_ensure_credits(handle, extra_credits, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } if (ext4_has_feature_ea_inode(inode->i_sb) && ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_get_inode_loc(inode, &iloc); if (error) { EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); goto cleanup; } error = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (error) { EXT4_ERROR_INODE(inode, "write access (error %d)", error); goto cleanup; } header = IHDR(inode, ext4_raw_inode(&iloc)); if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, IFIRST(header), false /* block_csum */, ea_inode_array, extra_credits, false /* skip_quota */); } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); if (error == -EIO) { EXT4_ERROR_INODE_ERR(inode, EIO, "block %llu read error", EXT4_I(inode)->i_file_acl); } bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; error = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (error) continue; ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); iput(ea_inode); } } ext4_xattr_release_block(handle, inode, bh, ea_inode_array, extra_credits); /* * Update i_file_acl value in the same transaction that releases * block. */ EXT4_I(inode)->i_file_acl = 0; error = ext4_mark_inode_dirty(handle, inode); if (error) { EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", error); goto cleanup; } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: brelse(iloc.bh); brelse(bh); return error; } void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) { int idx; if (ea_inode_array == NULL) return; for (idx = 0; idx < ea_inode_array->count; ++idx) iput(ea_inode_array->inodes[idx]); kfree(ea_inode_array); } /* * ext4_xattr_block_cache_insert() * * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. * * Returns 0, or a negative error number on failure. */ static void ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); int reusable = le32_to_cpu(header->h_refcount) < EXT4_XATTR_REFCOUNT_MAX; int error; if (!ea_block_cache) return; error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); } else ea_bdebug(bh, "inserting [%x]", (int)hash); } /* * ext4_xattr_cmp() * * Compare two extended attribute blocks for equality. * * Returns 0 if the blocks are equal, 1 if they differ, and * a negative error number on errors. */ static int ext4_xattr_cmp(struct ext4_xattr_header *header1, struct ext4_xattr_header *header2) { struct ext4_xattr_entry *entry1, *entry2; entry1 = ENTRY(header1+1); entry2 = ENTRY(header2+1); while (!IS_LAST_ENTRY(entry1)) { if (IS_LAST_ENTRY(entry2)) return 1; if (entry1->e_hash != entry2->e_hash || entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; if (!entry1->e_value_inum && memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) return 1; entry1 = EXT4_XATTR_NEXT(entry1); entry2 = EXT4_XATTR_NEXT(entry2); } if (!IS_LAST_ENTRY(entry2)) return 1; return 0; } /* * ext4_xattr_block_cache_find() * * Find an identical extended attribute block. * * Returns a pointer to the block found, or NULL if such a block was * not found or an error occurred. */ static struct buffer_head * ext4_xattr_block_cache_find(struct inode *inode, struct ext4_xattr_header *header, struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!ea_block_cache) return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); if (IS_ERR(bh)) { if (PTR_ERR(bh) == -ENOMEM) { mb_cache_entry_put(ea_block_cache, ce); return NULL; } bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } #define NAME_HASH_SHIFT 5 #define VALUE_HASH_SHIFT 16 /* * ext4_xattr_hash_entry() * * Compute the hash of an extended attribute. */ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count) { __u32 hash = 0; while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ le32_to_cpu(*value++); } return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT #undef VALUE_HASH_SHIFT #define BLOCK_HASH_SHIFT 16 /* * ext4_xattr_rehash() * * Re-compute the extended attribute hash value after an entry has changed. */ static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { /* Block is not shared if an entry's hash value == 0 */ hash = 0; break; } hash = (hash << BLOCK_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ le32_to_cpu(here->e_hash); here = EXT4_XATTR_NEXT(here); } header->h_hash = cpu_to_le32(hash); } #undef BLOCK_HASH_SHIFT #define HASH_BUCKET_BITS 10 struct mb_cache * ext4_xattr_create_cache(void) { return mb_cache_create(HASH_BUCKET_BITS); } void ext4_xattr_destroy_cache(struct mb_cache *cache) { if (cache) mb_cache_destroy(cache); } |
19 19 19 19 19 19 200 201 21 22 887 887 13 13 50 50 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/netfilter_bridge.h> #include <uapi/linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> #include <linux/rculist.h> #include <linux/inetdevice.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/dst_metadata.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #endif static unsigned int brnf_net_id __read_mostly; struct brnf_net { bool enabled; #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_hdr; #endif /* default value is 1 */ int call_iptables; int call_ip6tables; int call_arptables; /* default value is 0 */ int filter_vlan_tagged; int filter_pppoe_tagged; int pass_vlan_indev; }; #define IS_IP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) #define IS_IPV6(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) #define IS_ARP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) static inline __be16 vlan_proto(const struct sk_buff *skb) { if (skb_vlan_tag_present(skb)) return skb->protocol; else if (skb->protocol == htons(ETH_P_8021Q)) return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; else return 0; } static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged; } static inline bool is_vlan_ipv6(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_IPV6) && brnet->filter_vlan_tagged; } static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged; } static inline __be16 pppoe_proto(const struct sk_buff *skb) { return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + sizeof(struct pppoe_hdr))); } static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return skb->protocol == htons(ETH_P_PPP_SES) && pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged; } static inline bool is_pppoe_ipv6(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return skb->protocol == htons(ETH_P_PPP_SES) && pppoe_proto(skb) == htons(PPP_IPV6) && brnet->filter_pppoe_tagged; } /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) struct brnf_frag_data { char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; u8 encap_size; u8 size; u16 vlan_tci; __be16 vlan_proto; }; static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); static void nf_bridge_info_free(struct sk_buff *skb) { skb_ext_del(skb, SKB_EXT_BRIDGE_NF); } static inline struct net_device *bridge_parent(const struct net_device *dev) { struct net_bridge_port *port; port = br_port_get_rcu(dev); return port ? port->br->dev : NULL; } static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { return skb_ext_add(skb, SKB_EXT_BRIDGE_NF); } unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { case __cpu_to_be16(ETH_P_8021Q): return VLAN_HLEN; case __cpu_to_be16(ETH_P_PPP_SES): return PPPOE_SES_HLEN; default: return 0; } } static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) { unsigned int len = nf_bridge_encap_header_len(skb); skb_pull(skb, len); skb->network_header += len; } static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) { unsigned int len = nf_bridge_encap_header_len(skb); skb_pull_rcsum(skb, len); skb->network_header += len; } /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format */ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) { const struct iphdr *iph; u32 len; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* Basic sanity checks */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = ntohs(iph->tot_len); if (skb->len < len) { __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); /* We should really parse IP options here but until * somebody who actually uses IP options complains to * us we'll just silently ignore the options because * we're lazy! */ return 0; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: return -1; } void nf_bridge_update_protocol(struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); switch (nf_bridge->orig_proto) { case BRNF_PROTO_8021Q: skb->protocol = htons(ETH_P_8021Q); break; case BRNF_PROTO_PPPOE: skb->protocol = htons(ETH_P_PPP_SES); break; case BRNF_PROTO_UNCHANGED: break; } } /* Obtain the correct destination MAC address, while preserving the original * source MAC address. If we already know this address, we just copy it. If we * don't, we use the neighbour framework to find out. In both cases, we make * sure that br_handle_frame_finish() is called afterwards. */ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb) { struct neighbour *neigh; struct dst_entry *dst; skb->dev = bridge_parent(skb->dev); if (!skb->dev) goto free_skb; dst = skb_dst(skb); neigh = dst_neigh_lookup_skb(dst, skb); if (neigh) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); int ret; if ((neigh->nud_state & NUD_CONNECTED) && neigh->hh.hh_len) { neigh_hh_bridge(&neigh->hh, skb); skb->dev = nf_bridge->physindev; ret = br_handle_frame_finish(net, sk, skb); } else { /* the neighbour function below overwrites the complete * MAC header, so we save the Ethernet source address and * protocol number. */ skb_copy_from_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN), nf_bridge->neigh_header, ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ nf_bridge->bridged_dnat = 1; /* FIXME Need to refragment */ ret = neigh->output(neigh, skb); } neigh_release(neigh); return ret; } free_skb: kfree_skb(skb); return 0; } static inline bool br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, const struct nf_bridge_info *nf_bridge) { return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; } /* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. * This is also true when SNAT takes place (for the reply direction). * * There are two cases to consider: * 1. The packet was DNAT'ed to a device in the same bridge * port group as it was received on. We can still bridge * the packet. * 2. The packet was DNAT'ed to a different device, either * a non-bridged device or another bridge port group. * The packet will need to be routed. * * The correct way of distinguishing between these two cases is to * call ip_route_input() and to look at skb->dst->dev, which is * changed to the destination device if ip_route_input() succeeds. * * Let's first consider the case that ip_route_input() succeeds: * * If the output device equals the logical bridge device the packet * came in on, we can consider this bridging. The corresponding MAC * address will be obtained in br_nf_pre_routing_finish_bridge. * Otherwise, the packet is considered to be routed and we just * change the destination MAC address so that the packet will * later be passed up to the IP stack to be routed. For a redirected * packet, ip_route_input() will give back the localhost as output device, * which differs from the bridge device. * * Let's now consider the case that ip_route_input() fails: * * This can be because the destination address is martian, in which case * the packet will be dropped. * If IP forwarding is disabled, ip_route_input() will fail, while * ip_route_output_key() can return success. The source * address for ip_route_output_key() is set to zero, so ip_route_output_key() * thinks we're handling a locally generated packet and won't care * if IP forwarding is enabled. If the output device equals the logical bridge * device, we proceed as if ip_route_input() succeeded. If it differs from the * logical bridge port or if ip_route_output_key() fails we drop the packet. */ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; int err; nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* If err equals -EHOSTUNREACH the error is due to a * martian destination or due to the fact that * forwarding is disabled. For most martian packets, * ip_route_output_key() will fail. It won't fail for 2 types of * martian destinations: loopback destinations and destination * 0.0.0.0. In both cases the packet will be dropped because the * destination is the loopback device and not the bridge. */ if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) goto free_skb; rt = ip_route_output(net, iph->daddr, 0, RT_TOS(iph->tos), 0); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ if (rt->dst.dev == dev) { skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); goto bridged_dnat; } ip_rt_put(rt); } free_skb: kfree_skb(skb); return 0; } else { if (skb_dst(skb)->dev == dev) { bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } } else { rt = bridge_parent_rtable(nf_bridge->physindev); if (!rt) { kfree_skb(skb); return 0; } skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_handle_frame_finish); return 0; } static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev, const struct net *net) { struct net_device *vlan, *br; struct brnf_net *brnet = net_generic(net, brnf_net_id); br = bridge_parent(dev); if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, skb_vlan_tag_get(skb) & VLAN_VID_MASK); return vlan ? vlan : br; } /* Some common code for IPv4/IPv6 */ struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } nf_bridge->in_prerouting = 1; nf_bridge->physindev = skb->dev; skb->dev = brnf_get_logical_dev(skb, skb->dev, net); if (skb->protocol == htons(ETH_P_8021Q)) nf_bridge->orig_proto = BRNF_PROTO_8021Q; else if (skb->protocol == htons(ETH_P_PPP_SES)) nf_bridge->orig_proto = BRNF_PROTO_PPPOE; /* Must drop socket now because of tproxy. */ skb_orphan(skb); return skb->dev; } /* Direct IPv6 traffic to br_nf_pre_routing_ipv6. * Replicate the checks that IPv4 does on packet reception. * Set skb->dev to the bridge device (i.e. parent of the * receiving device) to make netfilter happy, the REDIRECT * target in particular. Save the original destination IP * address to be able to detect DNAT afterwards. */ static unsigned int br_nf_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); struct brnf_net *brnet; if (unlikely(!pskb_may_pull(skb, len))) return NF_DROP; p = br_port_get_rcu(state->in); if (p == NULL) return NF_DROP; br = p->br; brnet = net_generic(state->net, brnf_net_id); if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) { if (!brnet->call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; if (!ipv6_mod_enabled()) { pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported."); return NF_DROP; } nf_bridge_pull_encap_header_rcsum(skb); return br_nf_pre_routing_ipv6(priv, skb, state); } if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) return NF_ACCEPT; if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) && !is_pppoe_ip(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); if (br_validate_ipv4(state->net, skb)) return NF_DROP; if (!nf_bridge_alloc(skb)) return NF_DROP; if (!setup_pre_routing(skb, state->net)) return NF_DROP; nf_bridge = nf_bridge_info_get(skb); nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IP); skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4; NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish); return NF_STOLEN; } #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* conntracks' nf_confirm logic cannot handle cloned skbs referencing * the same nf_conn entry, which will happen for multicast (broadcast) * Frames on bridges. * * Example: * macvlan0 * br0 * ethX ethY * * ethX (or Y) receives multicast or broadcast packet containing * an IP packet, not yet in conntrack table. * * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting. * -> skb->_nfct now references a unconfirmed entry * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge * interface. * 3. skb gets passed up the stack. * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb * and schedules a work queue to send them out on the lower devices. * * The clone skb->_nfct is not a copy, it is the same entry as the * original skb. The macvlan rx handler then returns RX_HANDLER_PASS. * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb. * * The Macvlan broadcast worker and normal confirm path will race. * * This race will not happen if step 2 already confirmed a clone. In that * case later steps perform skb_clone() with skb->_nfct already confirmed (in * hash table). This works fine. * * But such confirmation won't happen when eb/ip/nftables rules dropped the * packets before they reached the nf_confirm step in postrouting. * * Work around this problem by explicit confirmation of the entry at * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed * entry. * */ static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { bool promisc = BR_INPUT_SKB_CB(skb)->promisc; struct nf_conntrack *nfct = skb_nfct(skb); const struct nf_ct_hook *ct_hook; struct nf_conn *ct; int ret; if (promisc) { nf_reset_ct(skb); return NF_ACCEPT; } if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; ct = container_of(nfct, struct nf_conn, ct_general); if (likely(nf_ct_is_confirmed(ct))) return NF_ACCEPT; if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) { nf_reset_ct(skb); return NF_ACCEPT; } WARN_ON_ONCE(skb_shared(skb)); /* We can't call nf_confirm here, it would create a dependency * on nf_conntrack module. */ ct_hook = rcu_dereference(nf_ct_hook); if (!ct_hook) { skb->_nfct = 0ul; nf_conntrack_put(nfct); return NF_ACCEPT; } nf_bridge_pull_encap_header(skb); ret = ct_hook->confirm(skb); switch (ret & NF_VERDICT_MASK) { case NF_STOLEN: return NF_STOLEN; default: nf_bridge_push_encap_header(skb); break; } ct = container_of(nfct, struct nf_conn, ct_general); WARN_ON_ONCE(!nf_ct_is_confirmed(ct)); return ret; } #endif /* PF_BRIDGE/FORWARD *************************************************/ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *in; if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) { if (skb->protocol == htons(ETH_P_IP)) nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (skb->protocol == htons(ETH_P_IPV6)) nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; in = nf_bridge->physindev; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge_update_protocol(skb); } else { in = *((struct net_device **)(skb->cb)); } nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev, br_forward_finish); return 0; } /* This is the 'purely bridged' case. For IP, we pass the packet to * netfilter with indev and outdev set to the bridge device, * but we are still able to filter on the 'real' indev/outdev * because of the physdev module. For ARP, indev and outdev are the * bridge ports. */ static unsigned int br_nf_forward_ip(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; struct net_device *parent; u_int8_t pf; nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return NF_ACCEPT; /* Need exclusive nf_bridge_info since we might have multiple * different physoutdevs. */ if (!nf_bridge_unshare(skb)) return NF_DROP; nf_bridge = nf_bridge_info_get(skb); if (!nf_bridge) return NF_DROP; parent = bridge_parent(state->out); if (!parent) return NF_DROP; if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; nf_bridge_pull_encap_header(skb); if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } if (pf == NFPROTO_IPV4) { if (br_validate_ipv4(state->net, skb)) return NF_DROP; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; } if (pf == NFPROTO_IPV6) { if (br_validate_ipv6(state->net, skb)) return NF_DROP; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; } nf_bridge->physoutdev = skb->dev; if (pf == NFPROTO_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb, brnf_get_logical_dev(skb, state->in, state->net), parent, br_nf_forward_finish); return NF_STOLEN; } static unsigned int br_nf_forward_arp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct net_bridge_port *p; struct net_bridge *br; struct net_device **d = (struct net_device **)(skb->cb); struct brnf_net *brnet; p = br_port_get_rcu(state->out); if (p == NULL) return NF_ACCEPT; br = p->br; brnet = net_generic(state->net, brnf_net_id); if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) return NF_ACCEPT; if (!IS_ARP(skb)) { if (!is_vlan_arp(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header(skb); } if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr)))) return NF_DROP; if (arp_hdr(skb)->ar_pln != 4) { if (is_vlan_arp(skb, state->net)) nf_bridge_push_encap_header(skb); return NF_ACCEPT; } *d = state->in; NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb, state->in, state->out, br_nf_forward_finish); return NF_STOLEN; } static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct brnf_frag_data *data; int err; data = this_cpu_ptr(&brnf_frag_data_storage); err = skb_cow_head(skb, data->size); if (err) { kfree_skb(skb); return 0; } if (data->vlan_proto) __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } static int br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { unsigned int mtu = ip_skb_dst_mtu(sk, skb); struct iphdr *iph = ip_hdr(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } return ip_do_fragment(net, sk, skb, output); } static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE) return PPPOE_SES_HLEN; return 0; } static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); unsigned int mtu, mtu_reserved; mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } /* Fragmentation on metadata/template dst is not supported */ if (unlikely(!skb_valid_dst(skb))) goto drop; /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) && skb->protocol == htons(ETH_P_IP)) { struct brnf_frag_data *data; if (br_validate_ipv4(net, skb)) goto drop; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; data = this_cpu_ptr(&brnf_frag_data_storage); if (skb_vlan_tag_present(skb)) { data->vlan_tci = skb->vlan_tci; data->vlan_proto = skb->vlan_proto; } else { data->vlan_proto = 0; } data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); struct brnf_frag_data *data; if (br_validate_ipv6(net, skb)) goto drop; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); if (v6ops) return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); kfree_skb(skb); return -EMSGSIZE; } nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); drop: kfree_skb(skb); return 0; } /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *realoutdev = bridge_parent(skb->dev); u_int8_t pf; /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in * on a bridge, but was delivered locally and is now being routed: * * POST_ROUTING was already invoked from the ip stack. */ if (!nf_bridge || !nf_bridge->physoutdev) return NF_ACCEPT; if (!realoutdev) return NF_DROP; if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } nf_bridge_pull_encap_header(skb); if (pf == NFPROTO_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb, NULL, realoutdev, br_nf_dev_queue_xmit); return NF_STOLEN; } /* IP/SABOTAGE *****************************************************/ /* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING * for the second time. */ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge) { if (nf_bridge->sabotage_in_done) return NF_ACCEPT; if (!nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev) && !netif_is_l3_slave(skb->dev)) { nf_bridge->sabotage_in_done = 1; state->okfn(state->net, state->sk, skb); return NF_STOLEN; } } return NF_ACCEPT; } /* This is called when br_netfilter has called into iptables/netfilter, * and DNAT has taken place on a bridge-forwarded packet. * * neigh->output has created a new MAC header, with local br0 MAC * as saddr. * * This restores the original MAC saddr of the bridged packet * before invoking bridge forward logic to transmit the packet. */ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); skb_pull(skb, ETH_HLEN); nf_bridge->bridged_dnat = 0; BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), nf_bridge->neigh_header, ETH_HLEN - ETH_ALEN); skb->dev = nf_bridge->physindev; nf_bridge->physoutdev = NULL; br_handle_frame_finish(dev_net(skb->dev), NULL, skb); } static int br_nf_dev_xmit(struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge && nf_bridge->bridged_dnat) { br_nf_pre_routing_finish_bridge_slow(skb); return 1; } return 0; } static const struct nf_br_ops br_ops = { .br_dev_xmit_hook = br_nf_dev_xmit, }; /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * br_dev_queue_push_xmit is called afterwards */ static const struct nf_hook_ops br_nf_ops[] = { { .hook = br_nf_pre_routing, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_BRNF, }, #if IS_ENABLED(CONFIG_NF_CONNTRACK) { .hook = br_nf_local_in, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_LAST, }, #endif { .hook = br_nf_forward_ip, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_BRNF - 1, }, { .hook = br_nf_forward_arp, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_BRNF, }, { .hook = br_nf_post_routing, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_LAST, }, { .hook = ip_sabotage_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_FIRST, }, { .hook = ip_sabotage_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_FIRST, }, }; static int brnf_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct brnf_net *brnet; struct net *net; int ret; if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE)) return NOTIFY_DONE; ASSERT_RTNL(); net = dev_net(dev); brnet = net_generic(net, brnf_net_id); if (brnet->enabled) return NOTIFY_OK; ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); if (ret) return NOTIFY_BAD; brnet->enabled = true; return NOTIFY_OK; } static struct notifier_block brnf_notifier __read_mostly = { .notifier_call = brnf_device_event, }; /* recursively invokes nf_hook_slow (again), skipping already-called * hooks (< NF_BR_PRI_BRNF). * * Called with rcu read lock held. */ int br_nf_hook_thresh(unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { const struct nf_hook_entries *e; struct nf_hook_state state; struct nf_hook_ops **ops; unsigned int i; int ret; e = rcu_dereference(net->nf.hooks_bridge[hook]); if (!e) return okfn(net, sk, skb); ops = nf_hook_entries_get_hook_ops(e); for (i = 0; i < e->num_hook_entries; i++) { /* These hooks have already been called */ if (ops[i]->priority < NF_BR_PRI_BRNF) continue; /* These hooks have not been called yet, run them. */ if (ops[i]->priority > NF_BR_PRI_BRNF) break; /* take a closer look at NF_BR_PRI_BRNF. */ if (ops[i]->hook == br_nf_pre_routing) { /* This hook diverted the skb to this function, * hooks after this have not been run yet. */ i++; break; } } nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, e, i); if (ret == 1) ret = okfn(net, sk, skb); return ret; } #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *(int *)(ctl->data)) *(int *)(ctl->data) = 1; return ret; } static struct ctl_table brnf_table[] = { { .procname = "bridge-nf-call-arptables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-iptables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-ip6tables", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-vlan-tagged", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-pppoe-tagged", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-pass-vlan-input-dev", .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { } }; static inline void br_netfilter_sysctl_default(struct brnf_net *brnf) { brnf->call_iptables = 1; brnf->call_ip6tables = 1; brnf->call_arptables = 1; brnf->filter_vlan_tagged = 0; brnf->filter_pppoe_tagged = 0; brnf->pass_vlan_indev = 0; } static int br_netfilter_sysctl_init_net(struct net *net) { struct ctl_table *table = brnf_table; struct brnf_net *brnet; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL); if (!table) return -ENOMEM; } brnet = net_generic(net, brnf_net_id); table[0].data = &brnet->call_arptables; table[1].data = &brnet->call_iptables; table[2].data = &brnet->call_ip6tables; table[3].data = &brnet->filter_vlan_tagged; table[4].data = &brnet->filter_pppoe_tagged; table[5].data = &brnet->pass_vlan_indev; br_netfilter_sysctl_default(brnet); brnet->ctl_hdr = register_net_sysctl(net, "net/bridge", table); if (!brnet->ctl_hdr) { if (!net_eq(net, &init_net)) kfree(table); return -ENOMEM; } return 0; } static void br_netfilter_sysctl_exit_net(struct net *net, struct brnf_net *brnet) { struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(brnet->ctl_hdr); if (!net_eq(net, &init_net)) kfree(table); } static int __net_init brnf_init_net(struct net *net) { return br_netfilter_sysctl_init_net(net); } #endif static void __net_exit brnf_exit_net(struct net *net) { struct brnf_net *brnet; brnet = net_generic(net, brnf_net_id); if (brnet->enabled) { nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); brnet->enabled = false; } #ifdef CONFIG_SYSCTL br_netfilter_sysctl_exit_net(net, brnet); #endif } static struct pernet_operations brnf_net_ops __read_mostly = { #ifdef CONFIG_SYSCTL .init = brnf_init_net, #endif .exit = brnf_exit_net, .id = &brnf_net_id, .size = sizeof(struct brnf_net), }; static int __init br_netfilter_init(void) { int ret; ret = register_pernet_subsys(&brnf_net_ops); if (ret < 0) return ret; ret = register_netdevice_notifier(&brnf_notifier); if (ret < 0) { unregister_pernet_subsys(&brnf_net_ops); return ret; } RCU_INIT_POINTER(nf_br_ops, &br_ops); printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; } static void __exit br_netfilter_fini(void) { RCU_INIT_POINTER(nf_br_ops, NULL); unregister_netdevice_notifier(&brnf_notifier); unregister_pernet_subsys(&brnf_net_ops); } module_init(br_netfilter_init); module_exit(br_netfilter_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); |
16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark); u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming); int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); void batadv_tt_global_entry_release(struct kref *ref); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const u8 *src, const u8 *addr, unsigned short vid); void batadv_tt_free(struct batadv_priv *bat_priv); bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid); void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv); bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface); bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid); bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); int batadv_tt_cache_init(void); void batadv_tt_cache_destroy(void); /** * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ static inline void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { if (!tt_global_entry) return; kref_put(&tt_global_entry->common.refcount, batadv_tt_global_entry_release); } #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ |
50 50 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 | // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> #include <linux/netdevice.h> #include <linux/security.h> #include <net/net_namespace.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #ifdef CONFIG_LWTUNNEL #include <net/netfilter/nf_hooks_lwtunnel.h> #endif #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; MODULE_PARM_DESC(enable_hooks, "Always enable conntrack hooks"); module_param(enable_hooks, bool, 0000); unsigned int nf_conntrack_net_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_PROCFS void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *l4proto) { switch (tuple->src.l3num) { case NFPROTO_IPV4: seq_printf(s, "src=%pI4 dst=%pI4 ", &tuple->src.u3.ip, &tuple->dst.u3.ip); break; case NFPROTO_IPV6: seq_printf(s, "src=%pI6 dst=%pI6 ", tuple->src.u3.ip6, tuple->dst.u3.ip6); break; default: break; } switch (l4proto->l4proto) { case IPPROTO_ICMP: seq_printf(s, "type=%u code=%u id=%u ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_TCP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); break; case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.udp.port), ntohs(tuple->dst.u.udp.port)); break; case IPPROTO_DCCP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.dccp.port), ntohs(tuple->dst.u.dccp.port)); break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), ntohs(tuple->dst.u.sctp.port)); break; case IPPROTO_ICMPV6: seq_printf(s, "type=%u code=%u id=%u ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_GRE: seq_printf(s, "srckey=0x%x dstkey=0x%x ", ntohs(tuple->src.u.gre.key), ntohs(tuple->dst.u.gre.key)); break; default: break; } } EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; unsigned int bucket; u_int64_t time_now; }; static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; st->bucket < st->htable_size; st->bucket++) { n = rcu_dereference( hlist_nulls_first_rcu(&st->hash[st->bucket])); if (!is_a_nulls(n)) return n; } return NULL; } static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= st->htable_size) return NULL; } head = rcu_dereference( hlist_nulls_first_rcu(&st->hash[st->bucket])); } return head; } static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) { struct hlist_nulls_node *head = ct_get_first(seq); if (head) while (pos && (head = ct_get_next(seq, head))) pos--; return pos ? NULL : head; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); return ct_get_idx(seq, *pos); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return ct_get_next(s, v); } static void ct_seq_stop(struct seq_file *s, void *v) __releases(RCU) { rcu_read_unlock(); } #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { int ret; u32 len; char *secctx; ret = security_secid_to_secctx(ct->secmark, &secctx, &len); if (ret) return; seq_printf(s, "secctx=%s ", secctx); security_release_secctx(secctx, len); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { } #endif #ifdef CONFIG_NF_CONNTRACK_ZONES static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, int dir) { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); if (zone->dir != dir) return; switch (zone->dir) { case NF_CT_DEFAULT_ZONE_DIR: seq_printf(s, "zone=%u ", zone->id); break; case NF_CT_ZONE_DIR_ORIG: seq_printf(s, "zone-orig=%u ", zone->id); break; case NF_CT_ZONE_DIR_REPL: seq_printf(s, "zone-reply=%u ", zone->id); break; default: break; } } #else static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, int dir) { } #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { struct ct_iter_state *st = s->private; struct nf_conn_tstamp *tstamp; s64 delta_time; tstamp = nf_conn_tstamp_find(ct); if (tstamp) { delta_time = st->time_now - tstamp->start; if (delta_time > 0) delta_time = div_s64(delta_time, NSEC_PER_SEC); else delta_time = 0; seq_printf(s, "delta-time=%llu ", (unsigned long long)delta_time); } return; } #else static inline void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { } #endif static const char* l3proto_name(u16 proto) { switch (proto) { case AF_INET: return "ipv4"; case AF_INET6: return "ipv6"; } return "unknown"; } static const char* l4proto_name(u16 proto) { switch (proto) { case IPPROTO_ICMP: return "icmp"; case IPPROTO_TCP: return "tcp"; case IPPROTO_UDP: return "udp"; case IPPROTO_DCCP: return "dccp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; case IPPROTO_ICMPV6: return "icmpv6"; } return "unknown"; } static unsigned int seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) { struct nf_conn_acct *acct; struct nf_conn_counter *counter; acct = nf_conn_acct_find(ct); if (!acct) return 0; counter = acct->counter; seq_printf(s, "packets=%llu bytes=%llu ", (unsigned long long)atomic64_read(&counter[dir].packets), (unsigned long long)atomic64_read(&counter[dir].bytes)); return 0; } /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); const struct nf_conntrack_l4proto *l4proto; struct net *net = seq_file_net(s); int ret = 0; WARN_ON(!ct); if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use))) return 0; if (nf_ct_should_gc(ct)) { nf_ct_kill(ct); goto release; } /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) goto release; if (!net_eq(nf_ct_net(ct), net)) goto release; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u ", l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct), l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) l4proto->print_conntrack(s, ct); print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); if (seq_has_overflowed(s)) goto release; if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) goto release; if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[HW_OFFLOAD] "); else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); if (seq_has_overflowed(s)) goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) seq_printf(s, "mark=%u ", READ_ONCE(ct->mark)); #endif ct_show_secctx(s, ct); ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use)); if (seq_has_overflowed(s)) goto release; ret = 0; release: nf_ct_put(ct); return ret; } static const struct seq_operations ct_seq_ops = { .start = ct_seq_start, .next = ct_seq_next, .stop = ct_seq_stop, .show = ct_seq_show }; static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } return NULL; } static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } (*pos)++; return NULL; } static void ct_cpu_seq_stop(struct seq_file *seq, void *v) { } static int ct_cpu_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); const struct ip_conntrack_stat *st = v; unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } nr_conntracks = nf_conntrack_count(net); seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, st->clash_resolve, st->found, 0, st->invalid, 0, 0, st->chaintoolong, st->insert, st->insert_failed, st->drop, st->early_drop, st->error, st->expect_new, st->expect_create, st->expect_delete, st->search_restart ); return 0; } static const struct seq_operations ct_cpu_seq_ops = { .start = ct_cpu_seq_start, .next = ct_cpu_seq_next, .stop = ct_cpu_seq_stop, .show = ct_cpu_seq_show, }; static int nf_conntrack_standalone_init_proc(struct net *net) { struct proc_dir_entry *pde; kuid_t root_uid; kgid_t root_gid; pde = proc_create_net("nf_conntrack", 0440, net->proc_net, &ct_seq_ops, sizeof(struct ct_iter_state)); if (!pde) goto out_nf_conntrack; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(pde, root_uid, root_gid); pde = proc_create_net("nf_conntrack", 0444, net->proc_net_stat, &ct_cpu_seq_ops, sizeof(struct seq_net_private)); if (!pde) goto out_stat_nf_conntrack; return 0; out_stat_nf_conntrack: remove_proc_entry("nf_conntrack", net->proc_net); out_nf_conntrack: return -ENOMEM; } static void nf_conntrack_standalone_fini_proc(struct net *net) { remove_proc_entry("nf_conntrack", net->proc_net_stat); remove_proc_entry("nf_conntrack", net->proc_net); } #else static int nf_conntrack_standalone_init_proc(struct net *net) { return 0; } static void nf_conntrack_standalone_fini_proc(struct net *net) { } #endif /* CONFIG_NF_CONNTRACK_PROCFS */ u32 nf_conntrack_count(const struct net *net) { const struct nf_conntrack_net *cnet = nf_ct_pernet(net); return atomic_read(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_count); /* Sysctl support */ #ifdef CONFIG_SYSCTL /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; /* module_param hashsize could have changed value */ nf_conntrack_htable_size_user = nf_conntrack_htable_size; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; /* update ret, we might not be able to satisfy request */ ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user); /* update it to the actual value used by conntrack */ nf_conntrack_htable_size_user = nf_conntrack_htable_size; return ret; } static struct ctl_table_header *nf_ct_netfilter_header; enum nf_ct_sysctl_index { NF_SYSCTL_CT_MAX, NF_SYSCTL_CT_COUNT, NF_SYSCTL_CT_BUCKETS, NF_SYSCTL_CT_CHECKSUM, NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, NF_SYSCTL_CT_HELPER, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_SYSCTL_CT_TIMESTAMP, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, #endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST, NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, #ifdef CONFIG_NF_CT_PROTO_SCTP NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING, NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT, NF_SYSCTL_CT_PROTO_DCCP_LOOSE, #endif #ifdef CONFIG_NF_CT_PROTO_GRE NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif #ifdef CONFIG_LWTUNNEL NF_SYSCTL_CT_LWTUNNEL, #endif __NF_SYSCTL_CT_LAST_SYSCTL, }; #define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, [NF_SYSCTL_CT_BUCKETS] = { .procname = "nf_conntrack_buckets", .data = &nf_conntrack_htable_size_user, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = nf_conntrack_hash_sysctl, }, [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", .data = &init_net.ct.sysctl_acct, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_HELPER] = { .procname = "nf_conntrack_helper", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", .data = &init_net.ct.sysctl_events, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP [NF_SYSCTL_CT_TIMESTAMP] = { .procname = "nf_conntrack_timestamp", .data = &init_net.ct.sysctl_tstamp, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = { .procname = "nf_conntrack_generic_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT] = { .procname = "nf_conntrack_tcp_timeout_syn_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV] = { .procname = "nf_conntrack_tcp_timeout_syn_recv", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED] = { .procname = "nf_conntrack_tcp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT] = { .procname = "nf_conntrack_tcp_timeout_fin_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT] = { .procname = "nf_conntrack_tcp_timeout_close_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK] = { .procname = "nf_conntrack_tcp_timeout_last_ack", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT] = { .procname = "nf_conntrack_tcp_timeout_time_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE] = { .procname = "nf_conntrack_tcp_timeout_close", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS] = { .procname = "nf_conntrack_tcp_timeout_max_retrans", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK] = { .procname = "nf_conntrack_tcp_timeout_unacknowledged", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = { .procname = "nf_flowtable_tcp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = { .procname = "nf_conntrack_tcp_ignore_invalid_rst", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = { .procname = "nf_conntrack_udp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM] = { .procname = "nf_conntrack_udp_timeout_stream", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { .procname = "nf_flowtable_udp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6] = { .procname = "nf_conntrack_icmpv6_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_NF_CT_PROTO_SCTP [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED] = { .procname = "nf_conntrack_sctp_timeout_closed", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT] = { .procname = "nf_conntrack_sctp_timeout_cookie_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED] = { .procname = "nf_conntrack_sctp_timeout_cookie_echoed", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED] = { .procname = "nf_conntrack_sctp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .procname = "nf_conntrack_sctp_timeout_shutdown_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .procname = "nf_conntrack_sctp_timeout_shutdown_recd", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { .procname = "nf_conntrack_dccp_timeout_request", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = { .procname = "nf_conntrack_dccp_timeout_respond", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = { .procname = "nf_conntrack_dccp_timeout_partopen", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = { .procname = "nf_conntrack_dccp_timeout_open", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = { .procname = "nf_conntrack_dccp_timeout_closereq", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = { .procname = "nf_conntrack_dccp_timeout_closing", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = { .procname = "nf_conntrack_dccp_timeout_timewait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { .procname = "nf_conntrack_dccp_loose", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_NF_CT_PROTO_GRE [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { .procname = "nf_conntrack_gre_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM] = { .procname = "nf_conntrack_gre_timeout_stream", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif #ifdef CONFIG_LWTUNNEL [NF_SYSCTL_CT_LWTUNNEL] = { .procname = "nf_hooks_lwtunnel", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = nf_hooks_lwtunnel_sysctl_handler, }, #endif {} }; static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, struct ctl_table *table) { struct nf_tcp_net *tn = nf_tcp_pernet(net); #define XASSIGN(XNAME, tn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ ## XNAME].data = \ &(tn)->timeouts[TCP_CONNTRACK_ ## XNAME] XASSIGN(SYN_SENT, tn); XASSIGN(SYN_RECV, tn); XASSIGN(ESTABLISHED, tn); XASSIGN(FIN_WAIT, tn); XASSIGN(CLOSE_WAIT, tn); XASSIGN(LAST_ACK, tn); XASSIGN(TIME_WAIT, tn); XASSIGN(CLOSE, tn); XASSIGN(RETRANS, tn); XASSIGN(UNACK, tn); #undef XASSIGN #define XASSIGN(XNAME, rval) \ table[NF_SYSCTL_CT_PROTO_TCP_ ## XNAME].data = (rval) XASSIGN(LOOSE, &tn->tcp_loose); XASSIGN(LIBERAL, &tn->tcp_be_liberal); XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst); #undef XASSIGN #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; #endif } static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net *sn = nf_sctp_pernet(net); #define XASSIGN(XNAME, sn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ ## XNAME].data = \ &(sn)->timeouts[SCTP_CONNTRACK_ ## XNAME] XASSIGN(CLOSED, sn); XASSIGN(COOKIE_WAIT, sn); XASSIGN(COOKIE_ECHOED, sn); XASSIGN(ESTABLISHED, sn); XASSIGN(SHUTDOWN_SENT, sn); XASSIGN(SHUTDOWN_RECD, sn); XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); #undef XASSIGN #endif } static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_DCCP struct nf_dccp_net *dn = nf_dccp_pernet(net); #define XASSIGN(XNAME, dn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \ &(dn)->dccp_timeout[CT_DCCP_ ## XNAME] XASSIGN(REQUEST, dn); XASSIGN(RESPOND, dn); XASSIGN(PARTOPEN, dn); XASSIGN(OPEN, dn); XASSIGN(CLOSEREQ, dn); XASSIGN(CLOSING, dn); XASSIGN(TIMEWAIT, dn); #undef XASSIGN table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose; #endif } static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_GRE struct nf_gre_net *gn = nf_gre_pernet(net); table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE].data = &gn->timeouts[GRE_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM].data = &gn->timeouts[GRE_CT_REPLIED]; #endif } static int nf_conntrack_standalone_init_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; BUILD_BUG_ON(ARRAY_SIZE(nf_ct_sysctl_table) != NF_SYSCTL_CT_LAST_SYSCTL); table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), GFP_KERNEL); if (!table) return -ENOMEM; table[NF_SYSCTL_CT_COUNT].data = &cnet->count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP table[NF_SYSCTL_CT_TIMESTAMP].data = &net->ct.sysctl_tstamp; #endif table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; #endif nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); /* Don't allow non-init_net ns to alter global sysctls */ if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_MAX].mode = 0444; table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!cnet->sysctl_header) goto out_unregister_netfilter; return 0; out_unregister_netfilter: kfree(table); return -ENOMEM; } static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); kfree(table); } #else static int nf_conntrack_standalone_init_sysctl(struct net *net) { return 0; } static void nf_conntrack_standalone_fini_sysctl(struct net *net) { } #endif /* CONFIG_SYSCTL */ static void nf_conntrack_fini_net(struct net *net) { if (enable_hooks) nf_ct_netns_put(net, NFPROTO_INET); nf_conntrack_standalone_fini_proc(net); nf_conntrack_standalone_fini_sysctl(net); } static int nf_conntrack_pernet_init(struct net *net) { int ret; net->ct.sysctl_checksum = 1; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) return ret; ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; ret = nf_conntrack_init_net(net); if (ret < 0) goto out_init_net; if (enable_hooks) { ret = nf_ct_netns_get(net, NFPROTO_INET); if (ret < 0) goto out_hooks; } return 0; out_hooks: nf_conntrack_cleanup_net(net); out_init_net: nf_conntrack_standalone_fini_proc(net); out_proc: nf_conntrack_standalone_fini_sysctl(net); return ret; } static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) nf_conntrack_fini_net(net); nf_conntrack_cleanup_net_list(net_exit_list); } static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, .id = &nf_conntrack_net_id, .size = sizeof(struct nf_conntrack_net), }; static int __init nf_conntrack_standalone_init(void) { int ret = nf_conntrack_init_start(); if (ret < 0) goto out_start; BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER); #ifdef CONFIG_SYSCTL nf_ct_netfilter_header = register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); if (!nf_ct_netfilter_header) { pr_err("nf_conntrack: can't register to sysctl.\n"); ret = -ENOMEM; goto out_sysctl; } nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif nf_conntrack_init_end(); ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; return 0; out_pernet: #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(nf_ct_netfilter_header); out_sysctl: #endif nf_conntrack_cleanup_end(); out_start: return ret; } static void __exit nf_conntrack_standalone_fini(void) { nf_conntrack_cleanup_start(); unregister_pernet_subsys(&nf_conntrack_net_ops); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(nf_ct_netfilter_header); #endif nf_conntrack_cleanup_end(); } module_init(nf_conntrack_standalone_init); module_exit(nf_conntrack_standalone_fini); |
142 8506 6823 6381 5687 4531 1448 214 214 381 379 44 6 2 24 25 97 140 105 1642 990 213 58 1033 2541 4007 510 75 38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <linux/kernel.h> /* * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } #ifdef CONFIG_DEBUG_LIST extern bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next); extern bool __list_del_entry_valid(struct list_head *entry); #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. */ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_rcu - Iterate over a list in an RCU-safe fashion * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ !list_is_head(pos, (head)); \ pos = rcu_dereference(pos->next)) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ (&pos->member == (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. * * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) #endif |
86 81 75 85 86 86 86 81 81 81 81 86 81 81 81 75 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 | #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/pagemap.h> #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #ifndef CONFIG_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return false; tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return true; } static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; } tlb->active = &tlb->local; } static void tlb_batch_list_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch, *next; for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb->local.next = NULL; } bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); #endif batch = tlb->active; /* * Add the page and check if we are full. If so * force a flush. */ batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max, page); return false; } #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE static void __tlb_remove_table_free(struct mmu_table_batch *batch) { int i; for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); } #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * Semi RCU freeing of the page directories. * * This is needed by some architectures to implement software pagetable walkers. * * gup_fast() and other software pagetable walkers do a lockless page-table * walk and therefore needs some synchronization with the freeing of the page * directories. The chosen means to accomplish that is by disabling IRQs over * the walk. * * Architectures that use IPIs to flush TLBs will then automagically DTRT, * since we unlink the page, flush TLBs, free the page. Since the disabling of * IRQs delays the completion of the TLB flush we can never observe an already * freed page. * * Architectures that do not have this (PPC) need to delay the freeing by some * other means, this is that means. * * What we do is batch the freed directory pages (tables) and RCU free them. * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling * holds off grace periods. * * However, in order to batch these pages we need to allocate storage, this * allocation is deep inside the MM code and can thus easily fail on memory * pressure. To guarantee progress we fall back to single table freeing, see * the implementation of tlb_remove_table_one(). * */ static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ } void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on * IRQ disabling. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); } static void tlb_remove_table_rcu(struct rcu_head *head) { __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); } static void tlb_remove_table_free(struct mmu_table_batch *batch) { call_rcu(&batch->rcu, tlb_remove_table_rcu); } #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ /* * If we want tlb_remove_table() to imply TLB invalidates. */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { if (tlb_needs_table_invalidate()) { /* * Invalidate page-table caches used by hardware walkers. Then * we still need to RCU-sched wait while freeing the pages * because software walkers can still be in-flight. */ tlb_flush_mmu_tlbonly(tlb); } } static void tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } static void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { tlb_table_invalidate(tlb); tlb_remove_table_free(*batch); *batch = NULL; } } void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); } static inline void tlb_table_init(struct mmu_gather *tlb) { tlb->batch = NULL; } #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_table_flush(struct mmu_gather *tlb) { } static inline void tlb_table_init(struct mmu_gather *tlb) { } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { tlb_table_flush(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_pages_flush(tlb); #endif } void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { tlb->mm = mm; tlb->fullmm = fullmm; #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; tlb->batch_count = 0; #endif tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } /** * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * In this case, @mm is without users and we're going to destroy the * full address space (exit/execve). * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); } /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish * * Called at the end of the shootdown operation to free up any resources that * were required. */ void tlb_finish_mmu(struct mmu_gather *tlb) { /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB * flush by batching, one thread may end up seeing inconsistent PTEs * and result in having stale TLB entries. So flush TLB forcefully * if we detect parallel PTE batching threads. * * However, some syscalls, e.g. munmap(), may free page tables, this * needs force flush everything in the given range. Otherwise this * may result in having stale TLB entries for some architectures, * e.g. aarch64, that could specify flush what level TLB. */ if (mm_tlb_flush_nested(tlb->mm)) { /* * The aarch64 yields better performance with fullmm by * avoiding multiple CPUs spamming TLBI messages at the * same time. * * On x86 non-fullmm doesn't yield significant difference * against fullmm. */ tlb->fullmm = 1; __tlb_reset_range(tlb); tlb->freed_tables = 1; } tlb_flush_mmu(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif dec_tlb_flush_pending(tlb->mm); } |
4921 4921 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_COMPAT_H #define _ASM_X86_COMPAT_H /* * Architecture specific compatibility types */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <asm/processor.h> #include <asm/user32.h> #include <asm/unistd.h> #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; #include <asm-generic/compat.h> #define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" typedef u16 __compat_uid_t; typedef u16 __compat_gid_t; typedef u16 compat_dev_t; typedef u16 compat_nlink_t; typedef u16 compat_ipc_pid_t; typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; u32 st_atime; u32 st_atime_nsec; u32 st_mtime; u32 st_mtime_nsec; u32 st_ctime; u32 st_ctime_nsec; u32 __unused4; u32 __unused5; }; struct compat_flock { short l_type; short l_whence; compat_off_t l_start; compat_off_t l_len; compat_pid_t l_pid; }; #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 /* * IA32 uses 4 byte alignment for 64 bit quantities, * so we need to pack this structure. */ struct compat_flock64 { short l_type; short l_whence; compat_loff_t l_start; compat_loff_t l_len; compat_pid_t l_pid; } __attribute__((packed)); struct compat_statfs { int f_type; int f_bsize; int f_blocks; int f_bfree; int f_bavail; int f_files; int f_ffree; compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; int f_flags; int f_spare[4]; }; #define COMPAT_RLIM_INFINITY 0xffffffff #define COMPAT_OFF_T_MAX 0x7fffffff struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; __compat_gid32_t gid; __compat_uid32_t cuid; __compat_gid32_t cgid; unsigned short mode; unsigned short __pad1; unsigned short seq; unsigned short __pad2; compat_ulong_t unused1; compat_ulong_t unused2; }; struct compat_semid64_ds { struct compat_ipc64_perm sem_perm; compat_ulong_t sem_otime; compat_ulong_t sem_otime_high; compat_ulong_t sem_ctime; compat_ulong_t sem_ctime_high; compat_ulong_t sem_nsems; compat_ulong_t __unused3; compat_ulong_t __unused4; }; struct compat_msqid64_ds { struct compat_ipc64_perm msg_perm; compat_ulong_t msg_stime; compat_ulong_t msg_stime_high; compat_ulong_t msg_rtime; compat_ulong_t msg_rtime_high; compat_ulong_t msg_ctime; compat_ulong_t msg_ctime_high; compat_ulong_t msg_cbytes; compat_ulong_t msg_qnum; compat_ulong_t msg_qbytes; compat_pid_t msg_lspid; compat_pid_t msg_lrpid; compat_ulong_t __unused4; compat_ulong_t __unused5; }; struct compat_shmid64_ds { struct compat_ipc64_perm shm_perm; compat_size_t shm_segsz; compat_ulong_t shm_atime; compat_ulong_t shm_atime_high; compat_ulong_t shm_dtime; compat_ulong_t shm_dtime_high; compat_ulong_t shm_ctime; compat_ulong_t shm_ctime_high; compat_pid_t shm_cpid; compat_pid_t shm_lpid; compat_ulong_t shm_nattch; compat_ulong_t __unused4; compat_ulong_t __unused5; }; #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) return true; #endif return false; } static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); } #ifdef CONFIG_COMPAT static inline bool in_compat_syscall(void) { return in_32bit_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ #define compat_need_64bit_alignment_fixup in_ia32_syscall #endif struct compat_siginfo; #ifdef CONFIG_X86_X32_ABI int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); #define copy_siginfo_to_user32 copy_siginfo_to_user32 #endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */ |
3764 3620 8 25 1169 96 191 83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H #include <linux/capability.h> #include <linux/init.h> #include <linux/key.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/sched.h> #include <linux/sched/user.h> struct cred; struct inode; /* * COW Supplementary groups list */ struct group_info { atomic_t usage; int ngroups; kgid_t gid[]; } __randomize_layout; /** * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. */ static inline struct group_info *get_group_info(struct group_info *gi) { atomic_inc(&gi->usage); return gi; } /** * put_group_info - Release a reference to a group info structure * @group_info: The group info to release */ #define put_group_info(group_info) \ do { \ if (atomic_dec_and_test(&(group_info)->usage)) \ groups_free(group_info); \ } while (0) #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); extern int groups_search(const struct group_info *, kgid_t); extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern bool may_setgroups(void); extern void groups_sort(struct group_info *); #else static inline void groups_free(struct group_info *group_info) { } static inline int in_group_p(kgid_t grp) { return 1; } static inline int in_egroup_p(kgid_t grp) { return 1; } static inline int groups_search(const struct group_info *group_info, kgid_t grp) { return 1; } #endif /* * The security context of a task * * The parts of the context break down into two categories: * * (1) The objective context of a task. These parts are used when some other * task is attempting to affect this one. * * (2) The subjective context. These details are used when the task is acting * upon another object, be that a file, a task, a key or whatever. * * Note that some members of this structure belong to both categories - the * LSM security pointer for instance. * * A task has two security pointers. task->real_cred points to the objective * context that defines that task's actual details. The objective part of this * context is used whenever that task is acted upon. * * task->cred points to the subjective context that defines the details of how * that task is going to act upon another object. This may be overridden * temporarily to point to another security context, but normally points to the * same context as task->real_cred. */ struct cred { atomic_long_t usage; #ifdef CONFIG_DEBUG_CREDENTIALS atomic_t subscribers; /* number of processes subscribed */ void *put_addr; unsigned magic; #define CRED_MAGIC 0x43736564 #define CRED_MAGIC_DEAD 0x44656144 #endif kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct ucounts *ucounts; struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? */ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); extern void revert_creds(const struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int change_create_files_as(struct cred *, struct inode *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); extern int set_cred_ucounts(struct cred *); /* * check for validity of credentials */ #ifdef CONFIG_DEBUG_CREDENTIALS extern void __invalid_creds(const struct cred *, const char *, unsigned); extern void __validate_process_creds(struct task_struct *, const char *, unsigned); extern bool creds_are_invalid(const struct cred *cred); static inline void __validate_creds(const struct cred *cred, const char *file, unsigned line) { if (unlikely(creds_are_invalid(cred))) __invalid_creds(cred, file, line); } #define validate_creds(cred) \ do { \ __validate_creds((cred), __FILE__, __LINE__); \ } while(0) #define validate_process_creds() \ do { \ __validate_process_creds(current, __FILE__, __LINE__); \ } while(0) extern void validate_creds_for_do_exit(struct task_struct *); #else static inline void validate_creds(const struct cred *cred) { } static inline void validate_creds_for_do_exit(struct task_struct *tsk) { } static inline void validate_process_creds(void) { } #endif static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, cap_intersect(cred->cap_permitted, cred->cap_inheritable)); } /** * get_new_cred - Get a reference on a new set of credentials * @cred: The new credentials to reference * * Get a reference on the specified set of new credentials. The caller must * release the reference. */ static inline struct cred *get_new_cred(struct cred *cred) { atomic_long_inc(&cred->usage); return cred; } /** * get_cred - Get a reference on a set of credentials * @cred: The credentials to reference * * Get a reference on the specified set of credentials. The caller must * release the reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. Although the * pointer is const, this will temporarily discard the const and increment the * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. */ static inline const struct cred *get_cred(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; validate_creds(cred); nonconst_cred->non_rcu = 0; return get_new_cred(nonconst_cred); } static inline const struct cred *get_cred_rcu(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; validate_creds(cred); nonconst_cred->non_rcu = 0; return cred; } /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. * * This takes a const pointer to a set of credentials because the credentials * on task_struct are attached by const pointers to prevent accidental * alteration of otherwise immutable credential sets. */ static inline void put_cred(const struct cred *_cred) { struct cred *cred = (struct cred *) _cred; if (cred) { validate_creds(cred); if (atomic_long_dec_and_test(&(cred)->usage)) __put_cred(cred); } } /** * current_cred - Access the current task's subjective credentials * * Access the subjective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_cred() \ rcu_dereference_protected(current->cred, 1) /** * current_real_cred - Access the current task's objective credentials * * Access the objective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_real_cred() \ rcu_dereference_protected(current->real_cred, 1) /** * __task_cred - Access a task's objective credentials * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ #define __task_cred(task) \ rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials * * Get the subjective credentials of the current task, pinning them so that * they can't go away. Accessing the current task's credentials directly is * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) /** * get_current_user - Get the current task's user_struct * * Get the user record of the current task, pinning it so that it can't go * away. */ #define get_current_user() \ ({ \ struct user_struct *__u; \ const struct cred *__cred; \ __cred = current_cred(); \ __u = get_uid(__cred->user); \ __u; \ }) /** * get_current_groups - Get the current task's supplementary group list * * Get the supplementary group list of the current task, pinning it so that it * can't go away. */ #define get_current_groups() \ ({ \ struct group_info *__groups; \ const struct cred *__cred; \ __cred = current_cred(); \ __groups = get_group_info(__cred->group_info); \ __groups; \ }) #define task_cred_xxx(task, xxx) \ ({ \ __typeof__(((struct cred *)NULL)->xxx) ___val; \ rcu_read_lock(); \ ___val = __task_cred((task))->xxx; \ rcu_read_unlock(); \ ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) #define task_ucounts(task) (task_cred_xxx((task), ucounts)) #define current_cred_xxx(xxx) \ ({ \ current_cred()->xxx; \ }) #define current_uid() (current_cred_xxx(uid)) #define current_gid() (current_cred_xxx(gid)) #define current_euid() (current_cred_xxx(euid)) #define current_egid() (current_cred_xxx(egid)) #define current_suid() (current_cred_xxx(suid)) #define current_sgid() (current_cred_xxx(sgid)) #define current_fsuid() (current_cred_xxx(fsuid)) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) #define current_ucounts() (current_cred_xxx(ucounts)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) #else static inline struct user_namespace *current_user_ns(void) { return &init_user_ns; } #endif #define current_uid_gid(_uid, _gid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_uid) = __cred->uid; \ *(_gid) = __cred->gid; \ } while(0) #define current_euid_egid(_euid, _egid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_euid) = __cred->euid; \ *(_egid) = __cred->egid; \ } while(0) #define current_fsuid_fsgid(_fsuid, _fsgid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_fsuid) = __cred->fsuid; \ *(_fsgid) = __cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */ |
6 58 194 1 193 194 135 88 105 3 3 3 3 190 189 190 167 74 156 84 172 65 3 172 68 190 97 72 56 3 83 83 83 24 59 83 83 82 83 83 83 82 24 59 82 83 86 86 58 58 57 1 124 48 2 46 22 23 6 2 32 244 244 25 13 14 14 13 16 16 16 50 50 50 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 | // SPDX-License-Identifier: GPL-2.0-or-later /* * PF_INET6 socket protocol family * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Adapted from linux/net/ipv4/af_inet.c * * Fixes: * piggy, Karl Knutson : Socket protocol table * Hideaki YOSHIFUJI : sin6_scope_id support * Arnaldo Melo : check proc_net_create return, cleanups */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/icmpv6.h> #include <linux/netfilter_ipv6.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/udp.h> #include <net/udplite.h> #include <net/tcp.h> #include <net/ping.h> #include <net/protocol.h> #include <net/inet_common.h> #include <net/route.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ipv6_stubs.h> #include <net/ndisc.h> #ifdef CONFIG_IPV6_TUNNEL #include <net/ip6_tunnel.h> #endif #include <net/calipso.h> #include <net/seg6.h> #include <net/rpl.h> #include <net/compat.h> #include <net/xfrm.h> #include <net/ioam6.h> #include <linux/uaccess.h> #include <linux/mroute6.h> #include "ip6_offload.h" MODULE_AUTHOR("Cast of dozens"); MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); MODULE_LICENSE("GPL"); /* The inetsw6 table contains everything that inet6_create needs to * build a new socket. */ static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); struct ipv6_params ipv6_defaults = { .disable_ipv6 = 0, .autoconf = 1, }; static int disable_ipv6_mod; module_param_named(disable, disable_ipv6_mod, int, 0444); MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444); MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); bool ipv6_mod_enabled(void) { return disable_ipv6_mod == 0; } EXPORT_SYMBOL_GPL(ipv6_mod_enabled); static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } void inet6_sock_destruct(struct sock *sk) { inet6_cleanup_sock(sk); inet_sock_destruct(sk); } EXPORT_SYMBOL_GPL(inet6_sock_destruct); static int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) { struct inet_sock *inet; struct ipv6_pinfo *np; struct sock *sk; struct inet_protosw *answer; struct proto *answer_prot; unsigned char answer_flags; int try_loading_module = 0; int err; if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) { err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; } if (err) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-10-proto-132-type-1 * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET6, protocol, sock->type); /* * Fall back to generic, e.g. net-pf-10-proto-132 * (net-pf-PF_INET6-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET6, protocol); goto lookup_protocol; } else goto out_rcu_unlock; } err = -EPERM; if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; answer_flags = answer->flags; rcu_read_unlock(); WARN_ON(!answer_prot->slab); err = -ENOBUFS; sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; sock_init_data(sock, sk); err = 0; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; if (INET_PROTOSW_ICSK & answer_flags) inet_init_csk_locks(sk); inet = inet_sk(sk); inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet->hdrincl = 1; } sk->sk_destruct = inet6_sock_destruct; sk->sk_family = PF_INET6; sk->sk_protocol = protocol; sk->sk_backlog_rcv = answer->prot->backlog_rcv; inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. */ inet->uc_ttl = -1; inet->mc_loop = 1; inet->mc_ttl = 1; inet->mc_index = 0; RCU_INIT_POINTER(inet->mc_list, NULL); inet->rcv_tos = 0; if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; /* * Increment only the relevant sk_prot->socks debug field, this changes * the previous behaviour of incrementing both the equivalent to * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. * * This allows better debug granularity as we'll know exactly how many * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 * transport protocol socks. -acme */ sk_refcnt_debug_inc(sk); if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically shares. */ inet->inet_sport = htons(inet->inet_num); err = sk->sk_prot->hash(sk); if (err) goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); if (err) goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; out_sk_release: sk_common_release(sk); sock->sk = NULL; goto out; } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); __be32 v4addr = 0; unsigned short snum; bool saved_ipv6only; int addr_type = 0; int err = 0; if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; addr_type = ipv6_addr_type(&addr->sin6_addr); if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ if (sk->sk_state != TCP_CLOSE || inet->inet_num) { err = -EINVAL; goto out; } /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { struct net_device *dev = NULL; int chk_addr_ret; /* Binding to v4-mapped address on a v6-only socket * makes no sense */ if (sk->sk_ipv6only) { err = -EINVAL; goto out; } rcu_read_lock(); if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; goto out_unlock; } } /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr); rcu_read_unlock(); if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { err = -EADDRNOTAVAIL; goto out; } } else { if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; rcu_read_lock(); if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another one * is supplied by user. */ sk->sk_bound_dev_if = addr->sin6_scope_id; } /* Binding to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { err = -EINVAL; goto out_unlock; } } if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; goto out_unlock; } } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { if (!ipv6_can_nonlocal_bind(net, inet) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; goto out_unlock; } } rcu_read_unlock(); } } inet->inet_rcv_saddr = v4addr; inet->inet_saddr = v4addr; sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; saved_ipv6only = sk->sk_ipv6only; if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED) sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; goto out; } if (!(flags & BIND_FROM_BPF)) { err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); if (err) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); goto out; } } } if (addr_type != IPV6_ADDR_ANY) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); inet->inet_dport = 0; inet->inet_daddr = 0; out: if (flags & BIND_WITH_LOCK) release_sock(sk); return err; out_unlock: rcu_read_unlock(); goto out; } /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; u32 flags = BIND_WITH_LOCK; const struct proto *prot; int err = 0; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); /* If the socket has its own bind function then use it. */ if (prot->bind) return prot->bind(sk, uaddr, addr_len); if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, CGROUP_INET6_BIND, &flags); if (err) return err; return __inet6_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return -EINVAL; /* Free mc lists */ ipv6_sock_mc_close(sk); /* Free ac lists */ ipv6_sock_ac_close(sk); return inet_release(sock); } EXPORT_SYMBOL(inet6_release); void inet6_destroy_sock(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct ipv6_txoptions *opt; /* Release rx options */ skb = xchg(&np->pktoptions, NULL); kfree_skb(skb); skb = xchg(&np->rxpmtu, NULL); kfree_skb(skb); /* Free flowlabels */ fl6_free_socklist(sk); /* Free tx options */ opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } } EXPORT_SYMBOL_GPL(inet6_destroy_sock); void inet6_cleanup_sock(struct sock *sk) { inet6_destroy_sock(sk); } EXPORT_SYMBOL_GPL(inet6_cleanup_sock); /* * This does both peername and sockname. */ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_scope_id = 0; lock_sock(sk); if (peer) { if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)) { release_sock(sk); return -ENOTCONN; } sin->sin6_port = inet->inet_dport; sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, CGROUP_INET6_GETPEERNAME); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, CGROUP_INET6_GETSOCKNAME); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); release_sock(sk); return sizeof(*sin); } EXPORT_SYMBOL(inet6_getname); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; struct net *net = sock_net(sk); const struct proto *prot; switch (cmd) { case SIOCADDRT: case SIOCDELRT: { struct in6_rtmsg rtmsg; if (copy_from_user(&rtmsg, argp, sizeof(rtmsg))) return -EFAULT; return ipv6_route_ioctl(net, cmd, &rtmsg); } case SIOCSIFADDR: return addrconf_add_ifaddr(net, argp); case SIOCDIFADDR: return addrconf_del_ifaddr(net, argp); case SIOCSIFDSTADDR: return addrconf_set_dstaddr(net, argp); default: /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); if (!prot->ioctl) return -ENOIOCTLCMD; return prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ return 0; } EXPORT_SYMBOL(inet6_ioctl); #ifdef CONFIG_COMPAT struct compat_in6_rtmsg { struct in6_addr rtmsg_dst; struct in6_addr rtmsg_src; struct in6_addr rtmsg_gateway; u32 rtmsg_type; u16 rtmsg_dst_len; u16 rtmsg_src_len; u32 rtmsg_metric; u32 rtmsg_info; u32 rtmsg_flags; s32 rtmsg_ifindex; }; static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd, struct compat_in6_rtmsg __user *ur) { struct in6_rtmsg rt; if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst, 3 * sizeof(struct in6_addr)) || get_user(rt.rtmsg_type, &ur->rtmsg_type) || get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len) || get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len) || get_user(rt.rtmsg_metric, &ur->rtmsg_metric) || get_user(rt.rtmsg_info, &ur->rtmsg_info) || get_user(rt.rtmsg_flags, &ur->rtmsg_flags) || get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex)) return -EFAULT; return ipv6_route_ioctl(sock_net(sk), cmd, &rt); } int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; switch (cmd) { case SIOCADDRT: case SIOCDELRT: return inet6_compat_routing_ioctl(sk, cmd, argp); default: return -ENOIOCTLCMD; } } EXPORT_SYMBOL_GPL(inet6_compat_ioctl); #endif /* CONFIG_COMPAT */ INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, size_t)); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; const struct proto *prot; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, sk, msg, size); } INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *, size_t, int, int, int *)); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; const struct proto *prot; int addr_len = 0; int err; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_stream_connect, /* ok */ .socketpair = sock_no_socketpair, /* a do nothing */ .accept = inet_accept, /* ok */ .getname = inet6_getname, .poll = tcp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .gettstamp = sock_gettstamp, .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, }; const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, /* ok */ .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, .poll = udp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ .read_sock = udp_read_sock, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static const struct net_proto_family inet6_family_ops = { .family = PF_INET6, .create = inet6_create, .owner = THIS_MODULE, }; int inet6_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; struct list_head *last_perm; int protocol = p->protocol; int ret; spin_lock_bh(&inetsw6_lock); ret = -EINVAL; if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ answer = NULL; ret = -EPERM; last_perm = &inetsw6[p->type]; list_for_each(lh, &inetsw6[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if (INET_PROTOSW_PERMANENT & answer->flags) { if (protocol == answer->protocol) break; last_perm = lh; } answer = NULL; } if (answer) goto out_permanent; /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm); ret = 0; out: spin_unlock_bh(&inetsw6_lock); return ret; out_permanent: pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } EXPORT_SYMBOL(inet6_register_protosw); void inet6_unregister_protosw(struct inet_protosw *p) { if (INET_PROTOSW_PERMANENT & p->flags) { pr_err("Attempt to unregister permanent protocol %d\n", p->protocol); } else { spin_lock_bh(&inetsw6_lock); list_del_rcu(&p->list); spin_unlock_bh(&inetsw6_lock); synchronize_net(); } } EXPORT_SYMBOL(inet6_unregister_protosw); int inet6_sk_rebuild_header(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct dst_entry *dst; dst = __sk_dst_check(sk, np->dst_cookie); if (!dst) { struct inet_sock *inet = inet_sk(sk); struct in6_addr *final_p, final; struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = sk->sk_protocol; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = np->saddr; fl6.flowlabel = np->flow_label; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { sk->sk_route_caps = 0; sk->sk_err_soft = -PTR_ERR(dst); return PTR_ERR(dst); } ip6_dst_store(sk, dst, NULL, NULL); } return 0; } EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt) { const struct ipv6_pinfo *np = inet6_sk(sk); if (np->rxopt.all) { if (((opt->flags & IP6SKB_HOPBYHOP) && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) return true; } return false; } EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, .list_func = ipv6_list_rcv, }; static int __init ipv6_packet_init(void) { dev_add_pack(&ipv6_packet_type); return 0; } static void ipv6_packet_cleanup(void) { dev_remove_pack(&ipv6_packet_type); } static int __net_init ipv6_init_mibs(struct net *net) { int i; net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib); if (!net->mib.udp_stats_in6) return -ENOMEM; net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib); if (!net->mib.udplite_stats_in6) goto err_udplite_mib; net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib); if (!net->mib.ipv6_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet6_stats; af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics, i); u64_stats_init(&af_inet6_stats->syncp); } net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib); if (!net->mib.icmpv6_statistics) goto err_icmp_mib; net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), GFP_KERNEL); if (!net->mib.icmpv6msg_statistics) goto err_icmpmsg_mib; return 0; err_icmpmsg_mib: free_percpu(net->mib.icmpv6_statistics); err_icmp_mib: free_percpu(net->mib.ipv6_statistics); err_ip_mib: free_percpu(net->mib.udplite_stats_in6); err_udplite_mib: free_percpu(net->mib.udp_stats_in6); return -ENOMEM; } static void ipv6_cleanup_mibs(struct net *net) { free_percpu(net->mib.udp_stats_in6); free_percpu(net->mib.udplite_stats_in6); free_percpu(net->mib.ipv6_statistics); free_percpu(net->mib.icmpv6_statistics); kfree(net->mib.icmpv6msg_statistics); } static int __net_init inet6_net_init(struct net *net) { int err = 0; net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; /* By default, rate limit error messages. * Except for pmtu discovery, it would break it. * proc_do_large_bitmap needs pointer to the bitmap. */ bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1); bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1); net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask; net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; net->ipv6.sysctl.flowlabel_state_ranges = 0; net->ipv6.sysctl.max_dst_opts_cnt = IP6_DEFAULT_MAX_DST_OPTS_CNT; net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT; net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN; net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN; net->ipv6.sysctl.fib_notify_on_flag_change = 0; atomic_set(&net->ipv6.fib6_sernum, 1); net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID; net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE; err = ipv6_init_mibs(net); if (err) return err; #ifdef CONFIG_PROC_FS err = udp6_proc_init(net); if (err) goto out; err = tcp6_proc_init(net); if (err) goto proc_tcp6_fail; err = ac6_proc_init(net); if (err) goto proc_ac6_fail; #endif return err; #ifdef CONFIG_PROC_FS proc_ac6_fail: tcp6_proc_exit(net); proc_tcp6_fail: udp6_proc_exit(net); out: ipv6_cleanup_mibs(net); return err; #endif } static void __net_exit inet6_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS udp6_proc_exit(net); tcp6_proc_exit(net); ac6_proc_exit(net); #endif ipv6_cleanup_mibs(net); } static struct pernet_operations inet6_net_ops = { .init = inet6_net_init, .exit = inet6_net_exit, }; static int ipv6_route_input(struct sk_buff *skb) { ip6_route_input(skb); return skb_dst(skb)->error; } static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, .ipv6_dst_lookup_flow = ip6_dst_lookup_flow, .ipv6_route_input = ipv6_route_input, .fib6_get_table = fib6_get_table, .fib6_table_lookup = fib6_table_lookup, .fib6_lookup = fib6_lookup, .fib6_select_path = fib6_select_path, .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, .fib6_nh_release_dsts = fib6_nh_release_dsts, .fib6_update_sernum = fib6_update_sernum_stub, .fib6_rt_update = fib6_rt_update, .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, #if IS_ENABLED(CONFIG_XFRM) .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu, .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif .nd_tbl = &nd_tbl, .ipv6_fragment = ip6_fragment, .ipv6_dev_find = ipv6_dev_find, }; static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .inet6_bind = __inet6_bind, .udp6_lib_lookup = __udp6_lib_lookup, .ipv6_dev_get_saddr = ipv6_dev_get_saddr, }; static int __init inet6_init(void) { struct list_head *r; int err = 0; sock_skb_cb_check_size(sizeof(struct inet6_skb_parm)); /* Register the socket-side information for inet6_create. */ for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); if (disable_ipv6_mod) { pr_info("Loaded, but administratively disabled, reboot required to enable\n"); goto out; } err = proto_register(&tcpv6_prot, 1); if (err) goto out; err = proto_register(&udpv6_prot, 1); if (err) goto out_unregister_tcp_proto; err = proto_register(&udplitev6_prot, 1); if (err) goto out_unregister_udp_proto; err = proto_register(&rawv6_prot, 1); if (err) goto out_unregister_udplite_proto; err = proto_register(&pingv6_prot, 1); if (err) goto out_unregister_raw_proto; /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. */ err = rawv6_init(); if (err) goto out_unregister_ping_proto; /* Register the family here so that the init calls below will * be able to create sockets. (?? is this dangerous ??) */ err = sock_register(&inet6_family_ops); if (err) goto out_sock_register_fail; /* * ipngwg API draft makes clear that the correct semantics * for TCP and UDP is to consider one TCP and UDP instance * in a host available by both INET and INET6 APIs and * able to communicate via both network protocols. */ err = register_pernet_subsys(&inet6_net_ops); if (err) goto register_pernet_fail; err = ip6_mr_init(); if (err) goto ipmr_fail; err = icmpv6_init(); if (err) goto icmp_fail; err = ndisc_init(); if (err) goto ndisc_fail; err = igmp6_init(); if (err) goto igmp_fail; err = ipv6_netfilter_init(); if (err) goto netfilter_fail; /* Create /proc/foo6 entries. */ #ifdef CONFIG_PROC_FS err = -ENOMEM; if (raw6_proc_init()) goto proc_raw6_fail; if (udplite6_proc_init()) goto proc_udplite6_fail; if (ipv6_misc_proc_init()) goto proc_misc6_fail; if (if6_proc_init()) goto proc_if6_fail; #endif err = ip6_route_init(); if (err) goto ip6_route_fail; err = ndisc_late_init(); if (err) goto ndisc_late_fail; err = ip6_flowlabel_init(); if (err) goto ip6_flowlabel_fail; err = ipv6_anycast_init(); if (err) goto ipv6_anycast_fail; err = addrconf_init(); if (err) goto addrconf_fail; /* Init v6 extension headers. */ err = ipv6_exthdrs_init(); if (err) goto ipv6_exthdrs_fail; err = ipv6_frag_init(); if (err) goto ipv6_frag_fail; /* Init v6 transport protocols. */ err = udpv6_init(); if (err) goto udpv6_fail; err = udplitev6_init(); if (err) goto udplitev6_fail; err = udpv6_offload_init(); if (err) goto udpv6_offload_fail; err = tcpv6_init(); if (err) goto tcpv6_fail; err = ipv6_packet_init(); if (err) goto ipv6_packet_fail; err = pingv6_init(); if (err) goto pingv6_fail; err = calipso_init(); if (err) goto calipso_fail; err = seg6_init(); if (err) goto seg6_fail; err = rpl_init(); if (err) goto rpl_fail; err = ioam6_init(); if (err) goto ioam6_fail; err = igmp6_late_init(); if (err) goto igmp6_late_err; #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) goto sysctl_fail; #endif /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; #ifdef CONFIG_SYSCTL sysctl_fail: igmp6_late_cleanup(); #endif igmp6_late_err: ioam6_exit(); ioam6_fail: rpl_exit(); rpl_fail: seg6_exit(); seg6_fail: calipso_exit(); calipso_fail: pingv6_exit(); pingv6_fail: ipv6_packet_cleanup(); ipv6_packet_fail: tcpv6_exit(); tcpv6_fail: udpv6_offload_exit(); udpv6_offload_fail: udplitev6_exit(); udplitev6_fail: udpv6_exit(); udpv6_fail: ipv6_frag_exit(); ipv6_frag_fail: ipv6_exthdrs_exit(); ipv6_exthdrs_fail: addrconf_cleanup(); addrconf_fail: ipv6_anycast_cleanup(); ipv6_anycast_fail: ip6_flowlabel_cleanup(); ip6_flowlabel_fail: ndisc_late_cleanup(); ndisc_late_fail: ip6_route_cleanup(); ip6_route_fail: #ifdef CONFIG_PROC_FS if6_proc_exit(); proc_if6_fail: ipv6_misc_proc_exit(); proc_misc6_fail: udplite6_proc_exit(); proc_udplite6_fail: raw6_proc_exit(); proc_raw6_fail: #endif ipv6_netfilter_fini(); netfilter_fail: igmp6_cleanup(); igmp_fail: ndisc_cleanup(); ndisc_fail: icmpv6_cleanup(); icmp_fail: ip6_mr_cleanup(); ipmr_fail: unregister_pernet_subsys(&inet6_net_ops); register_pernet_fail: sock_unregister(PF_INET6); rtnl_unregister_all(PF_INET6); out_sock_register_fail: rawv6_exit(); out_unregister_ping_proto: proto_unregister(&pingv6_prot); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udplite_proto: proto_unregister(&udplitev6_prot); out_unregister_udp_proto: proto_unregister(&udpv6_prot); out_unregister_tcp_proto: proto_unregister(&tcpv6_prot); goto out; } module_init(inet6_init); MODULE_ALIAS_NETPROTO(PF_INET6); |
18 18 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs attributes of bridge ports * Linux ethernet bridge * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/sched/signal.h> #include "br_private.h" /* IMPORTANT: new bridge port options must be added with netlink support only * please do not add new sysfs entries */ struct brport_attribute { struct attribute attr; ssize_t (*show)(struct net_bridge_port *, char *); int (*store)(struct net_bridge_port *, unsigned long); int (*store_raw)(struct net_bridge_port *, char *); }; #define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \ const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ .store_raw = _store, \ }; #define BRPORT_ATTR(_name, _mode, _show, _store) \ const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ .store = _store, \ }; #define BRPORT_ATTR_FLAG(_name, _mask) \ static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \ { \ return sprintf(buf, "%d\n", !!(p->flags & _mask)); \ } \ static int store_##_name(struct net_bridge_port *p, unsigned long v) \ { \ return store_flag(p, v, _mask); \ } \ static BRPORT_ATTR(_name, 0644, \ show_##_name, store_##_name) static int store_flag(struct net_bridge_port *p, unsigned long v, unsigned long mask) { struct netlink_ext_ack extack = {0}; unsigned long flags = p->flags; int err; if (v) flags |= mask; else flags &= ~mask; if (flags != p->flags) { err = br_switchdev_set_port_flag(p, flags, mask, &extack); if (err) { netdev_err(p->dev, "%s\n", extack._msg); return err; } p->flags = flags; br_port_flags_change(p, mask); } return 0; } static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->path_cost); } static BRPORT_ATTR(path_cost, 0644, show_path_cost, br_stp_set_path_cost); static ssize_t show_priority(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->priority); } static BRPORT_ATTR(priority, 0644, show_priority, br_stp_set_port_priority); static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_root); } static BRPORT_ATTR(designated_root, 0444, show_designated_root, NULL); static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_bridge); } static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL); static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_port); } static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL); static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_cost); } static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL); static ssize_t show_port_id(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_id); } static BRPORT_ATTR(port_id, 0444, show_port_id, NULL); static ssize_t show_port_no(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_no); } static BRPORT_ATTR(port_no, 0444, show_port_no, NULL); static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->topology_change_ack); } static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL); static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->config_pending); } static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL); static ssize_t show_port_state(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->state); } static BRPORT_ATTR(state, 0444, show_port_state, NULL); static ssize_t show_message_age_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); } static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL); static ssize_t show_forward_delay_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); } static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL); static ssize_t show_hold_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); } static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL); static int store_flush(struct net_bridge_port *p, unsigned long v) { br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry return 0; } static BRPORT_ATTR(flush, 0200, NULL, store_flush); static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%#x\n", p->group_fwd_mask); } static int store_group_fwd_mask(struct net_bridge_port *p, unsigned long v) { if (v & BR_GROUPFWD_MACPAUSE) return -EINVAL; p->group_fwd_mask = v; return 0; } static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask, store_group_fwd_mask); static ssize_t show_backup_port(struct net_bridge_port *p, char *buf) { struct net_bridge_port *backup_p; int ret = 0; rcu_read_lock(); backup_p = rcu_dereference(p->backup_port); if (backup_p) ret = sprintf(buf, "%s\n", backup_p->dev->name); rcu_read_unlock(); return ret; } static int store_backup_port(struct net_bridge_port *p, char *buf) { struct net_device *backup_dev = NULL; char *nl = strchr(buf, '\n'); if (nl) *nl = '\0'; if (strlen(buf) > 0) { backup_dev = __dev_get_by_name(dev_net(p->dev), buf); if (!backup_dev) return -ENOENT; } return nbp_backup_change(p, backup_dev); } static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port); BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router); } static int store_multicast_router(struct net_bridge_port *p, unsigned long v) { return br_multicast_set_port_router(&p->multicast_ctx, v); } static BRPORT_ATTR(multicast_router, 0644, show_multicast_router, store_multicast_router); BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST); #endif static const struct brport_attribute *brport_attrs[] = { &brport_attr_path_cost, &brport_attr_priority, &brport_attr_port_id, &brport_attr_port_no, &brport_attr_designated_root, &brport_attr_designated_bridge, &brport_attr_designated_port, &brport_attr_designated_cost, &brport_attr_state, &brport_attr_change_ack, &brport_attr_config_pending, &brport_attr_message_age_timer, &brport_attr_forward_delay_timer, &brport_attr_hold_timer, &brport_attr_flush, &brport_attr_hairpin_mode, &brport_attr_bpdu_guard, &brport_attr_root_block, &brport_attr_learning, &brport_attr_unicast_flood, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, &brport_attr_multicast_to_unicast, #endif &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, &brport_attr_neigh_suppress, &brport_attr_isolated, &brport_attr_backup_port, NULL }; #define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) static ssize_t brport_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct brport_attribute *brport_attr = to_brport_attr(attr); struct net_bridge_port *p = kobj_to_brport(kobj); if (!brport_attr->show) return -EINVAL; return brport_attr->show(p, buf); } static ssize_t brport_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct brport_attribute *brport_attr = to_brport_attr(attr); struct net_bridge_port *p = kobj_to_brport(kobj); ssize_t ret = -EINVAL; unsigned long val; char *endp; if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!rtnl_trylock()) return restart_syscall(); if (brport_attr->store_raw) { char *buf_copy; buf_copy = kstrndup(buf, count, GFP_KERNEL); if (!buf_copy) { ret = -ENOMEM; goto out_unlock; } spin_lock_bh(&p->br->lock); ret = brport_attr->store_raw(p, buf_copy); spin_unlock_bh(&p->br->lock); kfree(buf_copy); } else if (brport_attr->store) { val = simple_strtoul(buf, &endp, 0); if (endp == buf) goto out_unlock; spin_lock_bh(&p->br->lock); ret = brport_attr->store(p, val); spin_unlock_bh(&p->br->lock); } if (!ret) { br_ifinfo_notify(RTM_NEWLINK, NULL, p); ret = count; } out_unlock: rtnl_unlock(); return ret; } const struct sysfs_ops brport_sysfs_ops = { .show = brport_show, .store = brport_store, }; /* * Add sysfs entries to ethernet device added to a bridge. * Creates a brport subdirectory with bridge attributes. * Puts symlink in bridge's brif subdirectory */ int br_sysfs_addif(struct net_bridge_port *p) { struct net_bridge *br = p->br; const struct brport_attribute **a; int err; err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj, SYSFS_BRIDGE_PORT_LINK); if (err) return err; for (a = brport_attrs; *a; ++a) { err = sysfs_create_file(&p->kobj, &((*a)->attr)); if (err) return err; } strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ); return sysfs_create_link(br->ifobj, &p->kobj, p->sysfs_name); } /* Rename bridge's brif symlink */ int br_sysfs_renameif(struct net_bridge_port *p) { struct net_bridge *br = p->br; int err; /* If a rename fails, the rollback will cause another * rename call with the existing name. */ if (!strncmp(p->sysfs_name, p->dev->name, IFNAMSIZ)) return 0; err = sysfs_rename_link(br->ifobj, &p->kobj, p->sysfs_name, p->dev->name); if (err) netdev_notice(br->dev, "unable to rename link %s to %s", p->sysfs_name, p->dev->name); else strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ); return err; } |
12 2 199 2 6 2 10 108 106 106 7 7 7 101 100 15 2 9 106 7 7 257 257 7 257 255 7 257 7 6 1 14 15 15 15 15 15 101 101 99 2 1 100 80 13 100 100 101 17 101 101 92 22 100 100 100 100 6 6 101 100 100 100 102 1 67 3 31 100 99 100 7 7 15 4 2 9 100 100 100 273 8 196 11 186 11 2 125 148 49 34 51 228 165 66 2 18 8 190 203 4 3 190 3 201 41 1 200 185 1 15 7 7 1 81 44 125 124 1 25 66 103 42 4 38 6 3 35 35 32 3 90 59 32 32 4 48 2 2 75 8 54 3 9 2 53 13 7 35 3 1 31 31 140 138 2 137 1 87 87 157 5 131 1 131 65 82 3 50 27 96 96 6 84 6 2 88 7 95 32 48 80 15 8 7 95 56 16 128 25 21 4 44 30 2 5 3 30 27 27 21 10 6 5 5 6 3 8 3 61 1 4 22 6 18 18 124 103 104 103 42 104 51 77 27 7 8 67 22 7 44 3 3 9 8 50 16 66 66 2 27 25 3 10 34 3 36 3 23 18 80 83 77 11 129 16 113 15 21 19 6 3 15 1 1 553 533 1 13 4 2 2 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET4: Implementation of BSD Unix domain sockets. * * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Fixes: * Linus Torvalds : Assorted bug cures. * Niibe Yutaka : async I/O support. * Carsten Paeth : PF_UNIX check, address fixes. * Alan Cox : Limit size of allocated blocks. * Alan Cox : Fixed the stupid socketpair bug. * Alan Cox : BSD compatibility fine tuning. * Alan Cox : Fixed a bug in connect when interrupted. * Alan Cox : Sorted out a proper draft version of * file descriptor passing hacked up from * Mike Shaver's work. * Marty Leisner : Fixes to fd passing * Nick Nevin : recvmsg bugfix. * Alan Cox : Started proper garbage collector * Heiko EiBfeldt : Missing verify_area check * Alan Cox : Started POSIXisms * Andreas Schwab : Replace inode by dentry for proper * reference counting * Kirk Petersen : Made this a module * Christoph Rohland : Elegant non-blocking accept/connect algorithm. * Lots of bug fixes. * Alexey Kuznetosv : Repaired (I hope) bugs introduces * by above two patches. * Andrea Arcangeli : If possible we block in connect(2) * if the max backlog of the listen socket * is been reached. This won't break * old apps and it will avoid huge amount * of socks hashed (this for unix_gc() * performances reasons). * Security fix that limits the max * number of socks to 2*max_files and * the number of skb queueable in the * dgram receiver. * Artur Skawina : Hash function optimizations * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) * Malcolm Beattie : Set peercred for socketpair * Michal Ostrowski : Module initialization cleanup. * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, * the core infrastructure is doing that * for all net proto families now (2.5.69+) * * Known differences from reference BSD that was tested: * * [TO FIX] * ECONNREFUSED is not returned from one end of a connected() socket to the * other the moment one end closes. * fstat() doesn't return st_dev=0, and give the blksize as high water mark * and a fake inode identifier (nor the BSD first socket fstat twice bug). * [NOT TO FIX] * accept() returns a path name even if the connecting socket has closed * in the meantime (BSD loses the path and gives up). * accept() returns 0 length path for an unbound connector. BSD returns 16 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) * socketpair(...SOCK_RAW..) doesn't panic the kernel. * BSD af_unix apparently has connect forgetting to block properly. * (need to check this with the POSIX spec in detail) * * Differences from 2.0.0-11-... (ANK) * Bug fixes and improvements. * - client shutdown killed server socket. * - removed all useless cli/sti pairs. * * Semantic changes/extensions. * - generic control message passing. * - SCM_CREDENTIALS control message. * - "Abstract" (not FS based) socket bindings. * Abstract names are sequences of bytes (not zero terminated) * started by 0, so that this name space does not intersect * with BSD names. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/af_unix.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/scm.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/rtnetlink.h> #include <linux/mount.h> #include <net/checksum.h> #include <linux/security.h> #include <linux/freezer.h> #include <linux/file.h> #include <linux/btf_ids.h> #include "scm.h" struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); DEFINE_SPINLOCK(unix_table_lock); EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks; static struct hlist_head *unix_sockets_unbound(void *addr) { unsigned long hash = (unsigned long)addr; hash ^= hash >> 16; hash ^= hash >> 8; hash %= UNIX_HASH_SIZE; return &unix_socket_table[UNIX_HASH_SIZE + hash]; } #define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE) #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { scm->secid = UNIXCB(skb).secid; } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return true; } #endif /* CONFIG_SECURITY_NETWORK */ /* * SMP locking strategy: * hash table is protected with spinlock unix_table_lock * each socket state is protected by separate spin lock. */ static inline unsigned int unix_hash_fold(__wsum n) { unsigned int hash = (__force unsigned int)csum_fold(n); hash ^= hash>>8; return hash&(UNIX_HASH_SIZE-1); } #define unix_peer(sk) (unix_sk(sk)->peer) static inline int unix_our_peer(struct sock *sk, struct sock *osk) { return unix_peer(osk) == sk; } static inline int unix_may_send(struct sock *sk, struct sock *osk) { return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } static inline int unix_recvq_full_lockless(const struct sock *sk) { return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } struct sock *unix_peer_get(struct sock *s) { struct sock *peer; unix_state_lock(s); peer = unix_peer(s); if (peer) sock_hold(peer); unix_state_unlock(s); return peer; } EXPORT_SYMBOL_GPL(unix_peer_get); static inline void unix_release_addr(struct unix_address *addr) { if (refcount_dec_and_test(&addr->refcnt)) kfree(addr); } /* * Check unix socket name: * - should be not zero length. * - if started by not zero, should be NULL terminated (FS object) * - if started by zero, it is abstract name. */ static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) { *hashp = 0; if (len <= sizeof(short) || len > sizeof(*sunaddr)) return -EINVAL; if (!sunaddr || sunaddr->sun_family != AF_UNIX) return -EINVAL; if (sunaddr->sun_path[0]) { /* * This may look like an off by one error but it is a bit more * subtle. 108 is the longest valid AF_UNIX path for a binding. * sun_path[108] doesn't as such exist. However in kernel space * we are guaranteed that it is a valid memory location in our * kernel address buffer. */ ((char *)sunaddr)[len] = 0; len = strlen(sunaddr->sun_path)+1+sizeof(short); return len; } *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0)); return len; } static void __unix_remove_socket(struct sock *sk) { sk_del_node_init(sk); } static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) { WARN_ON(!sk_unhashed(sk)); sk_add_node(sk, list); } static void __unix_set_addr(struct sock *sk, struct unix_address *addr, unsigned hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); __unix_insert_socket(&unix_socket_table[hash], sk); } static inline void unix_remove_socket(struct sock *sk) { spin_lock(&unix_table_lock); __unix_remove_socket(sk); spin_unlock(&unix_table_lock); } static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) { spin_lock(&unix_table_lock); __unix_insert_socket(list, sk); spin_unlock(&unix_table_lock); } static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) { struct sock *s; sk_for_each(s, &unix_socket_table[hash]) { struct unix_sock *u = unix_sk(s); if (!net_eq(sock_net(s), net)) continue; if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) return s; } return NULL; } static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) { struct sock *s; spin_lock(&unix_table_lock); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&unix_table_lock); return s; } static struct sock *unix_find_socket_byinode(struct inode *i) { struct sock *s; spin_lock(&unix_table_lock); sk_for_each(s, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); goto found; } } s = NULL; found: spin_unlock(&unix_table_lock); return s; } /* Support code for asymmetrically connected dgram sockets * * If a datagram socket is connected to a socket not itself connected * to the first socket (eg, /dev/log), clients may only enqueue more * messages if the present receive queue of the server socket is not * "too large". This means there's a second writeability condition * poll and sendmsg need to test. The dgram recv code will do a wake * up on the peer_wait wait queue of a socket upon reception of a * datagram which needs to be propagated to sleeping would-be writers * since these might not have sent anything so far. This can't be * accomplished via poll_wait because the lifetime of the server * socket might be less than that of its clients if these break their * association with it or if the server socket is closed while clients * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or * poll for write) hit the flow control condition and broken when the * association to the server socket is dissolved or after a wake up * was relayed. */ static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, void *key) { struct unix_sock *u; wait_queue_head_t *u_sleep; u = container_of(q, struct unix_sock, peer_wake); __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, q); u->peer_wake.private = NULL; /* relaying can only happen while the wq still exists */ u_sleep = sk_sleep(&u->sk); if (u_sleep) wake_up_interruptible_poll(u_sleep, key_to_poll(key)); return 0; } static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) { struct unix_sock *u, *u_other; int rc; u = unix_sk(sk); u_other = unix_sk(other); rc = 0; spin_lock(&u_other->peer_wait.lock); if (!u->peer_wake.private) { u->peer_wake.private = other; __add_wait_queue(&u_other->peer_wait, &u->peer_wake); rc = 1; } spin_unlock(&u_other->peer_wait.lock); return rc; } static void unix_dgram_peer_wake_disconnect(struct sock *sk, struct sock *other) { struct unix_sock *u, *u_other; u = unix_sk(sk); u_other = unix_sk(other); spin_lock(&u_other->peer_wait.lock); if (u->peer_wake.private == other) { __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); u->peer_wake.private = NULL; } spin_unlock(&u_other->peer_wait.lock); } static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, struct sock *other) { unix_dgram_peer_wake_disconnect(sk, other); wake_up_interruptible_poll(sk_sleep(sk), EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } /* preconditions: * - unix_peer(sk) == other * - association is stable */ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) { int connected; connected = unix_dgram_peer_wake_connect(sk, other); /* If other is SOCK_DEAD, we want to make sure we signal * POLLOUT, such that a subsequent write() can get a * -ECONNREFUSED. Otherwise, if we haven't queued any skbs * to other and its full, we will hang waiting for POLLOUT. */ if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) return 1; if (connected) unix_dgram_peer_wake_disconnect(sk, other); return 0; } static int unix_writable(const struct sock *sk, unsigned char state) { return state != TCP_LISTEN && (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; } static void unix_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); if (unix_writable(sk, READ_ONCE(sk->sk_state))) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* When dgram socket disconnects (or changes its peer), we clear its receive * queue of packets arrived from previous peer. First, it allows to do * flow control based only on wmem_alloc; second, sk connected to peer * may receive messages only from that peer. */ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { if (!skb_queue_empty(&sk->sk_receive_queue)) { skb_queue_purge(&sk->sk_receive_queue); wake_up_interruptible_all(&unix_sk(sk)->peer_wait); /* If one link of bidirectional dgram pipe is disconnected, * we signal error. Messages are lost. Do not make this, * when peer was not connected to us. */ if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { WRITE_ONCE(other->sk_err, ECONNRESET); sk_error_report(other); } } } static void unix_sock_destructor(struct sock *sk) { struct unix_sock *u = unix_sk(sk); skb_queue_purge(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(!sk_unhashed(sk)); WARN_ON(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_info("Attempt to release alive unix socket: %p\n", sk); return; } if (u->addr) unix_release_addr(u->addr); atomic_long_dec(&unix_nr_socks); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); #ifdef UNIX_REFCNT_DEBUG pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, atomic_long_read(&unix_nr_socks)); #endif } static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); struct path path; struct sock *skpair; struct sk_buff *skb; int state; unix_remove_socket(sk); /* Clear state */ unix_state_lock(sk); sock_orphan(sk); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); path = u->path; u->path.dentry = NULL; u->path.mnt = NULL; state = sk->sk_state; WRITE_ONCE(sk->sk_state, TCP_CLOSE); skpair = unix_peer(sk); unix_peer(sk) = NULL; unix_state_unlock(sk); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (u->oob_skb) { kfree_skb(u->oob_skb); u->oob_skb = NULL; } #endif wake_up_interruptible_all(&u->peer_wait); if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ } /* Try to flush out this socket. Throw out buffers at least */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); /* passed fds are erased in the kfree_skb hook */ UNIXCB(skb).consumed = skb->len; kfree_skb(skb); } if (path.dentry) path_put(&path); sock_put(sk); /* ---- Socket is dead now and most probably destroyed ---- */ /* * Fixme: BSD difference: In BSD all sockets connected to us get * ECONNRESET and we die on the spot. In Linux we behave * like files and pipes do and wait for the last * dereference. * * Can't we simply set sock->err? * * What the above comment does talk about? --ANK(980817) */ if (READ_ONCE(unix_tot_inflight)) unix_gc(); /* Garbage collect fds */ } static void init_peercred(struct sock *sk) { const struct cred *old_cred; struct pid *old_pid; spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); spin_unlock(&sk->sk_peer_lock); put_pid(old_pid); put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { if (sk < peersk) { spin_lock(&sk->sk_peer_lock); spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&peersk->sk_peer_lock); spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); } sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); spin_unlock(&peersk->sk_peer_lock); } static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; /* Only stream/seqpacket sockets accept */ err = -EINVAL; if (!u->addr) goto out; /* No listens on an unbound socket */ unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; if (backlog > sk->sk_max_ack_backlog) wake_up_interruptible_all(&u->peer_wait); sk->sk_max_ack_backlog = backlog; WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ init_peercred(sk); err = 0; out_unlock: unix_state_unlock(sk); out: return err; } static int unix_release(struct socket *); static int unix_bind(struct socket *, struct sockaddr *, int); static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); #endif static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, size_t size, int flags); static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, struct pipe_inode_info *, size_t size, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); if (mutex_lock_interruptible(&u->iolock)) return -EINTR; WRITE_ONCE(sk->sk_peek_off, val); mutex_unlock(&u->iolock); return 0; } #ifdef CONFIG_PROC_FS static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; struct unix_sock *u; if (sk) { u = unix_sk(sock->sk); seq_printf(m, "scm_fds: %u\n", atomic_read(&u->scm_stat.nr_fds)); } } #else #define unix_show_fdinfo NULL #endif static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_stream_connect, .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, .poll = unix_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = unix_listen, .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .read_sock = unix_stream_read_sock, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_dgram_connect, .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, .read_sock = unix_read_sock, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_stream_connect, .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = unix_listen, .shutdown = unix_shutdown, .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static void unix_close(struct sock *sk, long timeout) { /* Nothing to do here, unix socket does not need a ->close(). * This is merely for sockmap. */ } static void unix_unhash(struct sock *sk) { /* Nothing to do here, unix socket does not need a ->unhash(). * This is merely for sockmap. */ } struct proto unix_dgram_proto = { .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_dgram_bpf_update_proto, #endif }; struct proto unix_stream_proto = { .name = "UNIX-STREAM", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, .unhash = unix_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, #endif }; static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { struct unix_sock *u; struct sock *sk; int err; atomic_long_inc(&unix_nr_socks); if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { err = -ENFILE; goto err; } if (type == SOCK_STREAM) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); else /*dgram and seqpacket */ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); if (!sk) { err = -ENOMEM; goto err; } sock_init_data(sock, sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->inflight = 0; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return sk; err: atomic_long_dec(&unix_nr_socks); return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; sock->state = SS_UNCONNECTED; switch (sock->type) { case SOCK_STREAM: sock->ops = &unix_stream_ops; break; /* * Believe it or not BSD has AF_UNIX, SOCK_RAW though * nothing uses it. */ case SOCK_RAW: sock->type = SOCK_DGRAM; fallthrough; case SOCK_DGRAM: sock->ops = &unix_dgram_ops; break; case SOCK_SEQPACKET: sock->ops = &unix_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } sk = unix_create1(net, sock, kern, sock->type); if (IS_ERR(sk)) return PTR_ERR(sk); return 0; } static int unix_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; sk->sk_prot->close(sk, 0); unix_release_sock(sk, 0); sock->sk = NULL; return 0; } static int unix_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); static u32 ordernum = 1; struct unix_address *addr; int err; unsigned int retries = 0; err = mutex_lock_interruptible(&u->bindlock); if (err) return err; if (u->addr) goto out; err = -ENOMEM; addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); if (!addr) goto out; addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); addr->hash ^= sk->sk_type; spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names * are already in use. */ cond_resched(); /* Give up if all names seems to be in use. */ if (retries++ == 0xFFFFF) { err = -ENOSPC; kfree(addr); goto out; } goto retry; } __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); err = 0; out: mutex_unlock(&u->bindlock); return err; } static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunname, int len, int type, unsigned int hash, int *error) { struct sock *u; struct path path; int err = 0; if (sunname->sun_path[0]) { struct inode *inode; err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; inode = d_backing_inode(path.dentry); err = path_permission(&path, MAY_WRITE); if (err) goto put_fail; err = -ECONNREFUSED; if (!S_ISSOCK(inode->i_mode)) goto put_fail; u = unix_find_socket_byinode(inode); if (!u) goto put_fail; if (u->sk_type == type) touch_atime(&path); path_put(&path); err = -EPROTOTYPE; if (u->sk_type != type) { sock_put(u); goto fail; } } else { err = -ECONNREFUSED; u = unix_find_socket_byname(net, sunname, len, type ^ hash); if (u) { struct dentry *dentry; dentry = unix_sk(u)->path.dentry; if (dentry) touch_atime(&unix_sk(u)->path); } else goto fail; } return u; put_fail: path_put(&path); fail: *error = err; return NULL; } static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) { struct unix_sock *u = unix_sk(sk); umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); struct user_namespace *ns; // barf... struct path parent; struct dentry *dentry; unsigned int hash; int err; /* * Get the parent directory, calculate the hash for last * component. */ dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); ns = mnt_user_ns(parent.mnt); /* * All right, let's create it. */ err = security_path_mknod(&parent, dentry, mode, 0); if (!err) err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0); if (err) goto out; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; if (u->addr) goto out_unlock; addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; out_unlock: mutex_unlock(&u->bindlock); err = -EINVAL; out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL); out: done_path_create(&parent, dentry); return err; } static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) { struct unix_sock *u = unix_sk(sk); int err; err = mutex_lock_interruptible(&u->bindlock); if (err) return err; if (u->addr) { mutex_unlock(&u->bindlock); return -EINVAL; } spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, addr->hash)) { spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return -EADDRINUSE; } __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return 0; } static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; int err; unsigned int hash; struct unix_address *addr; if (addr_len < offsetofend(struct sockaddr_un, sun_family) || sunaddr->sun_family != AF_UNIX) return -EINVAL; if (addr_len == sizeof(short)) return unix_autobind(sock); err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) return err; addr_len = err; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) return -ENOMEM; memcpy(addr->name, sunaddr, addr_len); addr->len = addr_len; addr->hash = hash ^ sk->sk_type; refcount_set(&addr->refcnt, 1); if (sun_path[0]) err = unix_bind_bsd(sk, addr); else err = unix_bind_abstract(sk, addr); if (err) unix_release_addr(addr); return err == -EEXIST ? -EADDRINUSE : err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_lock(sk1); return; } if (sk1 > sk2) swap(sk1, sk2); unix_state_lock(sk1); unix_state_lock_nested(sk2, U_LOCK_SECOND); } static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_unlock(sk1); return; } unix_state_unlock(sk1); unix_state_unlock(sk2); } static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; struct sock *other; unsigned int hash; int err; err = -EINVAL; if (alen < offsetofend(struct sockaddr, sa_family)) goto out; if (addr->sa_family != AF_UNSPEC) { err = unix_mkname(sunaddr, alen, &hash); if (err < 0) goto out; alen = err; if (test_bit(SOCK_PASSCRED, &sock->flags) && !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0) goto out; restart: other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err); if (!other) goto out; unix_state_double_lock(sk, other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_double_unlock(sk, other); sock_put(other); goto restart; } err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); } else { /* * 1003.1g breaking connected state with AF_UNSPEC */ other = NULL; unix_state_double_lock(sk, other); } /* * If it was connected, reconnect. */ if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; if (!other) WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); if (other != old_peer) { unix_dgram_disconnected(sk, old_peer); unix_state_lock(old_peer); if (!unix_peer(old_peer)) WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); unix_state_unlock(old_peer); } sock_put(old_peer); } else { unix_peer(sk) = other; unix_state_double_unlock(sk, other); } return 0; out_unlock: unix_state_double_unlock(sk, other); sock_put(other); out: return err; } static long unix_wait_for_peer(struct sock *other, long timeo) __releases(&unix_sk(other)->lock) { struct unix_sock *u = unix_sk(other); int sched; DEFINE_WAIT(wait); prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); sched = !sock_flag(other, SOCK_DEAD) && !(other->sk_shutdown & RCV_SHUTDOWN) && unix_recvq_full_lockless(other); unix_state_unlock(other); if (sched) timeo = schedule_timeout(timeo); finish_wait(&u->peer_wait, &wait); return timeo; } static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct sock *newsk = NULL; struct sock *other = NULL; struct sk_buff *skb = NULL; unsigned int hash; int err; long timeo; err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) goto out; addr_len = err; if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr && (err = unix_autobind(sock)) != 0) goto out; timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); /* First of all allocate resources. If we will make it after state is locked, we will have to recheck all again in any case. */ /* create new sock for complete connection */ newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); newsk = NULL; goto out; } err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (skb == NULL) goto out; restart: /* Find listening sock. */ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err); if (!other) goto out; /* Latch state of peer */ unix_state_lock(other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_unlock(other); sock_put(other); goto restart; } err = -ECONNREFUSED; if (other->sk_state != TCP_LISTEN) goto out_unlock; if (other->sk_shutdown & RCV_SHUTDOWN) goto out_unlock; if (unix_recvq_full_lockless(other)) { err = -EAGAIN; if (!timeo) goto out_unlock; timeo = unix_wait_for_peer(other, timeo); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; sock_put(other); goto restart; } /* Latch our state. It is tricky place. We need to grab our state lock and cannot drop lock on peer. It is dangerous because deadlock is possible. Connect to self case and simultaneous attempt to connect are eliminated by checking socket state. other is TCP_LISTEN, if sk is TCP_LISTEN we check this before attempt to grab lock. Well, and we have to recheck the state after socket locked. */ switch (READ_ONCE(sk->sk_state)) { case TCP_CLOSE: /* This is ok... continue with connect */ break; case TCP_ESTABLISHED: /* Socket is already connected */ err = -EISCONN; goto out_unlock; default: err = -EINVAL; goto out_unlock; } unix_state_lock_nested(sk, U_LOCK_SECOND); if (sk->sk_state != TCP_CLOSE) { unix_state_unlock(sk); unix_state_unlock(other); sock_put(other); goto restart; } err = security_unix_stream_connect(sk, other, newsk); if (err) { unix_state_unlock(sk); goto out_unlock; } /* The way is open! Fastly set all the necessary fields... */ sock_hold(sk); unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); /* copy address information from listening to new sock * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found * otheru in hash under unix_table_lock. Insertion * into the hash chain we'd found it in had been done * in an earlier critical area protected by unix_table_lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * * Using smp_store_release() here to set newu->addr * is enough to make those stores, as well as stores * to newu->path visible to anyone who gets newu->addr * by smp_load_acquire(). IOW, the same warranties * as for unix_sock instances bound in unix_bind() or * in unix_autobind(). */ if (otheru->path.dentry) { path_get(&otheru->path); newu->path = otheru->path; } refcount_inc(&otheru->addr->refcnt); smp_store_release(&newu->addr, otheru->addr); /* Set credentials */ copy_peercred(sk, other); sock->state = SS_CONNECTED; WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); sock_hold(newsk); smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); return 0; out_unlock: if (other) unix_state_unlock(other); out: kfree_skb(skb); if (newsk) unix_release_sock(newsk, 0); if (other) sock_put(other); return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { struct sock *ska = socka->sk, *skb = sockb->sk; /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; init_peercred(ska); init_peercred(skb); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; socka->state = SS_CONNECTED; sockb->state = SS_CONNECTED; return 0; } static void unix_sock_inherit_flags(const struct socket *old, struct socket *new) { if (test_bit(SOCK_PASSCRED, &old->flags)) set_bit(SOCK_PASSCRED, &new->flags); if (test_bit(SOCK_PASSSEC, &old->flags)) set_bit(SOCK_PASSSEC, &new->flags); } static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; struct sock *tsk; struct sk_buff *skb; int err; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out; /* If socket state is TCP_LISTEN it cannot change (for now...), * so that no locks are necessary. */ skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err); if (!skb) { /* This means receive shutdown. */ if (err == 0) err = -EINVAL; goto out; } tsk = skb->sk; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); /* attach accepted sock to socket */ unix_state_lock(tsk); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; out: return err; } static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct unix_address *addr; DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); int err = 0; if (peer) { sk = unix_peer_get(sk); err = -ENOTCONN; if (!sk) goto out; err = 0; } else { sock_hold(sk); } addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; err = sizeof(short); } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); } sock_put(sk); out: return err; } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); /* * Garbage collection of unix sockets starts by selecting a set of * candidate sockets which have reference only from being in flight * (total_refs == inflight_refs). This condition is checked once during * the candidate collection phase, and candidates are marked as such, so * that non-candidates can later be ignored. While inflight_refs is * protected by unix_gc_lock, total_refs (file count) is not, hence this * is an instantaneous decision. * * Once a candidate, however, the socket must not be reinstalled into a * file descriptor while the garbage collection is in progress. * * If the above conditions are met, then the directed graph of * candidates (*) does not change while unix_gc_lock is held. * * Any operations that changes the file count through file descriptors * (dup, close, sendmsg) does not change the graph since candidates are * not installed in fds. * * Dequeing a candidate via recvmsg would install it into an fd, but * that takes unix_gc_lock to decrement the inflight count, so it's * serialized with garbage collection. * * MSG_PEEK is special in that it does not change the inflight count, * yet does install the socket into an fd. The following lock/unlock * pair is to ensure serialization with garbage collection. It must be * done between incrementing the file count and installing the file into * an fd. * * If garbage collection starts after the barrier provided by the * lock/unlock, then it will see the elevated refcount and not mark this * as a candidate. If a garbage collection is already in progress * before the file count was incremented, then the lock/unlock pair will * ensure that garbage collection is finished before progressing to * installing the fd. * * (*) A -> B where B is on the queue of A or B is on the queue of C * which is on the queue of listening socket A. */ spin_lock(&unix_gc_lock); spin_unlock(&unix_gc_lock); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; UNIXCB(skb).pid = get_pid(scm->pid); UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); skb->destructor = unix_destruct_scm; return err; } static bool unix_passcred_enabled(const struct socket *sock, const struct sock *other) { return test_bit(SOCK_PASSCRED, &sock->flags) || !other->sk_socket || test_bit(SOCK_PASSCRED, &other->sk_socket->flags); } /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. */ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, const struct sock *other) { if (UNIXCB(skb).pid) return; if (unix_passcred_enabled(sock, other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } static int maybe_init_creds(struct scm_cookie *scm, struct socket *socket, const struct sock *other) { int err; struct msghdr msg = { .msg_controllen = 0 }; err = scm_send(socket, &msg, scm, false); if (err) return err; if (unix_passcred_enabled(socket, other)) { scm->pid = get_pid(task_tgid(current)); current_uid_gid(&scm->creds.uid, &scm->creds.gid); } return err; } static bool unix_skb_scm_eq(struct sk_buff *skb, struct scm_cookie *scm) { const struct unix_skb_parms *u = &UNIXCB(skb); return u->pid == scm->pid && uid_eq(u->uid, scm->creds.uid) && gid_eq(u->gid, scm->creds.gid) && unix_secdata_eq(scm, skb); } static void scm_stat_add(struct sock *sk, struct sk_buff *skb) { struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); if (unlikely(fp && fp->count)) atomic_add(fp->count, &u->scm_stat.nr_fds); } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) { struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); if (unlikely(fp && fp->count)) atomic_sub(fp->count, &u->scm_stat.nr_fds); } /* * Send AF_UNIX data. */ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *other = NULL; int namelen = 0; /* fake GCC */ int err; unsigned int hash; struct sk_buff *skb; long timeo; struct scm_cookie scm; int data_len = 0; int sk_locked; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; if (msg->msg_namelen) { err = unix_mkname(sunaddr, msg->msg_namelen, &hash); if (err < 0) goto out; namelen = err; } else { sunaddr = NULL; err = -ENOTCONN; other = unix_peer_get(sk); if (!other) goto out; } if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr && (err = unix_autobind(sock)) != 0) goto out; err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, len - SKB_MAX_ALLOC, MAX_SKB_FRAGS * PAGE_SIZE); data_len = PAGE_ALIGN(data_len); BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); } skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, PAGE_ALLOC_COSTLY_ORDER); if (skb == NULL) goto out; err = unix_scm_to_skb(&scm, skb, true); if (err < 0) goto out_free; skb_put(skb, len - data_len); skb->data_len = data_len; skb->len = len; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); if (err) goto out_free; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); restart: if (!other) { err = -ECONNRESET; if (sunaddr == NULL) goto out_free; other = unix_find_other(net, sunaddr, namelen, sk->sk_type, hash, &err); if (other == NULL) goto out_free; } if (sk_filter(other, skb) < 0) { /* Toss the packet but do not return any error to the sender */ err = len; goto out_free; } sk_locked = 0; unix_state_lock(other); restart_locked: err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; if (unlikely(sock_flag(other, SOCK_DEAD))) { /* * Check with 1003.1g - what should * datagram error */ unix_state_unlock(other); sock_put(other); if (!sk_locked) unix_state_lock(sk); err = 0; if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants. */ unix_state_unlock(sk); err = -EPIPE; } else if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_state_unlock(sk); unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; } else { unix_state_unlock(sk); } other = NULL; if (err) goto out_free; goto restart; } err = -EPIPE; if (other->sk_shutdown & RCV_SHUTDOWN) goto out_unlock; if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; } /* other == sk && unix_peer(other) != sk if * - unix_peer(sk) == NULL, destination address bound to sk * - unix_peer(sk) == sk by time of get but disconnected before lock */ if (other != sk && unlikely(unix_peer(other) != sk && unix_recvq_full_lockless(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_free; goto restart; } if (!sk_locked) { unix_state_unlock(other); unix_state_double_lock(sk, other); } if (unix_peer(sk) != other || unix_dgram_peer_wake_me(sk, other)) { err = -EAGAIN; sk_locked = 1; goto out_unlock; } if (!sk_locked) { sk_locked = 1; goto restart_locked; } } if (unlikely(sk_locked)) unix_state_unlock(sk); if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); scm_destroy(&scm); return len; out_unlock: if (sk_locked) unix_state_unlock(sk); unix_state_unlock(other); out_free: kfree_skb(skb); out: if (other) sock_put(other); scm_destroy(&scm); return err; } /* We use paged skbs for stream sockets, and limit occupancy to 32768 * bytes, and a minimum of a full page. */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, struct scm_cookie *scm, bool fds_sent) { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; int err = 0; skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; err = unix_scm_to_skb(scm, skb, !fds_sent); if (err < 0) { kfree_skb(skb); return err; } skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); if (err) { kfree_skb(skb); return err; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { unix_state_unlock(other); kfree_skb(skb); return -EPIPE; } maybe_add_creds(skb, sock, other); skb_get(skb); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); if (ousk->oob_skb) consume_skb(ousk->oob_skb); WRITE_ONCE(ousk->oob_skb, skb); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); sk_send_sigurg(other); unix_state_unlock(other); other->sk_data_ready(other); return err; } #endif static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sock *other = NULL; int err, size; struct sk_buff *skb; int sent = 0; struct scm_cookie scm; bool fds_sent = false; int data_len; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; else #endif goto out_err; } if (msg->msg_namelen) { err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { err = -ENOTCONN; other = unix_peer(sk); if (!other) goto out_err; } if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto pipe_err; while (sent < len) { size = len - sent; /* Keep two messages in the pipe so it schedules better */ size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); /* allow fallback to order-0 allocations */ size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); skb = sock_alloc_send_pskb(sk, size - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, get_order(UNIX_SKB_FRAGS_SZ)); if (!skb) goto out_err; /* Only send the fds in the first buffer */ err = unix_scm_to_skb(&scm, skb, !fds_sent); if (err < 0) { kfree_skb(skb); goto out_err; } fds_sent = true; skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) { kfree_skb(skb); goto out_err; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) goto pipe_err_free; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sent += size; } #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { err = queue_oob(sock, msg, other, &scm, fds_sent); if (err) goto out_err; sent++; } #endif scm_destroy(&scm); return sent; pipe_err_free: unix_state_unlock(other); kfree_skb(skb); pipe_err: if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: scm_destroy(&scm); return sent ? : err; } static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, int offset, size_t size, int flags) { int err; bool send_sigpipe = false; bool init_scm = true; struct scm_cookie scm; struct sock *other, *sk = socket->sk; struct sk_buff *skb, *newskb = NULL, *tail = NULL; if (flags & MSG_OOB) return -EOPNOTSUPP; other = unix_peer(sk); if (!other || sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; if (false) { alloc_skb: spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); mutex_unlock(&unix_sk(other)->iolock); newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, &err, 0); if (!newskb) goto err; } /* we must acquire iolock as we modify already present * skbs in the sk_receive_queue and mess with skb->len */ err = mutex_lock_interruptible(&unix_sk(other)->iolock); if (err) { err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS; goto err; } if (sk->sk_shutdown & SEND_SHUTDOWN) { err = -EPIPE; send_sigpipe = true; goto err_unlock; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || other->sk_shutdown & RCV_SHUTDOWN) { err = -EPIPE; send_sigpipe = true; goto err_state_unlock; } if (init_scm) { err = maybe_init_creds(&scm, socket, other); if (err) goto err_state_unlock; init_scm = false; } spin_lock(&other->sk_receive_queue.lock); skb = skb_peek_tail(&other->sk_receive_queue); if (tail && tail == skb) { skb = newskb; } else if (!skb || !unix_skb_scm_eq(skb, &scm)) { if (newskb) { skb = newskb; } else { tail = skb; goto alloc_skb; } } else if (newskb) { /* this is fast path, we don't necessarily need to * call to kfree_skb even though with newskb == NULL * this - does no harm */ consume_skb(newskb); newskb = NULL; } if (skb_append_pagefrags(skb, page, offset, size)) { tail = skb; goto alloc_skb; } skb->len += size; skb->data_len += size; skb->truesize += size; refcount_add(size, &sk->sk_wmem_alloc); if (newskb) { unix_scm_to_skb(&scm, skb, false); __skb_queue_tail(&other->sk_receive_queue, newskb); } spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); mutex_unlock(&unix_sk(other)->iolock); other->sk_data_ready(other); scm_destroy(&scm); return size; err_state_unlock: unix_state_unlock(other); err_unlock: mutex_unlock(&unix_sk(other)->iolock); err: kfree_skb(newskb); if (send_sigpipe && !(flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); if (!init_scm) scm_destroy(&scm); return err; } static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct sock *sk = sock->sk; err = sock_error(sk); if (err) return err; if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; if (msg->msg_namelen) msg->msg_namelen = 0; return unix_dgram_sendmsg(sock, msg, len); } static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; return unix_dgram_recvmsg(sock, msg, size, flags); } static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (addr) { msg->msg_namelen = addr->len; memcpy(msg->msg_name, addr->name, addr->len); } } int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct scm_cookie scm; struct socket *sock = sk->sk_socket; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; int skip; int err; err = -EOPNOTSUPP; if (flags&MSG_OOB) goto out; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, &skip, &err, &last); if (skb) { if (!(flags & MSG_PEEK)) scm_stat_del(sk, skb); break; } mutex_unlock(&u->iolock); if (err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN)) err = 0; unix_state_unlock(sk); goto out; } if (wq_has_sleeper(&u->peer_wait)) wake_up_interruptible_sync_poll(&u->peer_wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (msg->msg_name) unix_copy_addr(msg, skb->sk); if (size > skb->len - skip) size = skb->len - skip; else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; if (sock_flag(sk, SOCK_RCVTSTAMP)) __sock_recv_timestamp(msg, sk, skb); memset(&scm, 0, sizeof(scm)); scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); sk_peek_offset_bwd(sk, skb->len); } else { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) - return fds, and do not return them on read (old strategy, apparently wrong) - clone fds (I chose it for now, it is the most universal solution) POSIX 1003.1g does not actually define this clearly at all. POSIX 1003.1g doesn't define a lot of things clearly however! */ sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? skb->len - skip : size; scm_recv(sock, msg, &scm, flags); out_free: skb_free_datagram(sk, skb); mutex_unlock(&u->iolock); out: return err; } static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; #ifdef CONFIG_BPF_SYSCALL const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_dgram_proto) return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, NULL); #endif return __unix_dgram_recvmsg(sk, msg, size, flags); } static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { int copied = 0; while (1) { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int used, err; mutex_lock(&u->iolock); skb = skb_recv_datagram(sk, 0, 1, &err); mutex_unlock(&u->iolock); if (!skb) return err; used = recv_actor(desc, skb, 0, skb->len); if (used <= 0) { if (!copied) copied = used; kfree_skb(skb); break; } else if (used <= skb->len) { copied += used; } kfree_skb(skb); if (!desc->count) break; } return copied; } /* * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, unsigned int last_len, bool freezable) { struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); if (freezable) timeo = freezable_schedule_timeout(timeo); else timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) break; sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); unix_state_unlock(sk); return timeo; } static unsigned int unix_skb_len(const struct sk_buff *skb) { return skb->len - UNIXCB(skb).consumed; } struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); struct socket *socket; struct msghdr *msg; struct pipe_inode_info *pipe; size_t size; int flags; unsigned int splice_flags; }; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); spin_lock(&sk->sk_receive_queue.lock); if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); mutex_unlock(&u->iolock); return -EINVAL; } oob_skb = u->oob_skb; if (!(state->flags & MSG_PEEK)) WRITE_ONCE(u->oob_skb, NULL); else skb_get(oob_skb); spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); if (!(state->flags & MSG_PEEK)) UNIXCB(oob_skb).consumed += 1; consume_skb(oob_skb); mutex_unlock(&u->iolock); if (chunk < 0) return -EFAULT; state->msg->msg_flags |= MSG_OOB; return 1; } static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, int flags, int copied) { struct unix_sock *u = unix_sk(sk); if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); skb = NULL; } else { struct sk_buff *unlinked_skb = NULL; spin_lock(&sk->sk_receive_queue.lock); if (skb == u->oob_skb) { if (copied) { skb = NULL; } else if (!(flags & MSG_PEEK)) { if (sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } else { __skb_unlink(skb, &sk->sk_receive_queue); WRITE_ONCE(u->oob_skb, NULL); unlinked_skb = skb; skb = skb_peek(&sk->sk_receive_queue); } } else if (!sock_flag(sk, SOCK_URGINLINE)) { skb = skb_peek_next(skb, &sk->sk_receive_queue); } } spin_unlock(&sk->sk_receive_queue.lock); if (unlinked_skb) { WARN_ON_ONCE(skb_unref(unlinked_skb)); kfree_skb(unlinked_skb); } } return skb; } #endif static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) return -ENOTCONN; return unix_read_sock(sk, desc, recv_actor); } static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { struct scm_cookie scm; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int copied = 0; int flags = state->flags; int noblock = flags & MSG_DONTWAIT; bool check_creds = false; int target; int err = 0; long timeo; int skip; size_t size = state->size; unsigned int last_len; if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { err = -EINVAL; goto out; } if (unlikely(flags & MSG_OOB)) { err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) err = unix_stream_recv_urg(state); #endif goto out; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); memset(&scm, 0, sizeof(scm)); /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ mutex_lock(&u->iolock); skip = max(sk_peek_offset(sk, flags), 0); do { int chunk; bool drop_skb; struct sk_buff *skb, *last; redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); if (!skb && copied) { unix_state_unlock(sk); break; } } #endif if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. */ err = sock_error(sk); if (err) goto unlock; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; unix_state_unlock(sk); if (!timeo) { err = -EAGAIN; break; } mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, last_len, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); scm_destroy(&scm); goto out; } mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); break; } while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; } unix_state_unlock(sk); if (check_creds) { /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); check_creds = true; } /* Copy address just once */ if (state->msg && state->msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, state->msg->msg_name); unix_copy_addr(state->msg, skb->sk); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); skb_get(skb); chunk = state->recv_actor(skb, skip, chunk, state); drop_skb = !unix_skb_len(skb); /* skb is only safe to use if !drop_skb */ consume_skb(skb); if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; if (drop_skb) { /* the skb was touched by a concurrent reader; * we should not expect anything from this skb * anymore and assume it invalid - we can be * sure it was dropped from the socket queue * * let's report a short read */ err = 0; break; } /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) { scm_stat_del(sk, skb); unix_detach_fds(&scm, skb); } if (unix_skb_len(skb)) break; skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); if (scm.fp) break; } else { /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk); if (UNIXCB(skb).fp) break; skip = 0; last = skb; last_len = skb->len; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) goto again; unix_state_unlock(sk); break; } } while (size); mutex_unlock(&u->iolock); if (state->msg) scm_recv(sock, state->msg, &scm, flags); else scm_destroy(&scm); out: return copied ? : err; } static int unix_stream_read_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { int ret; ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, state->msg, chunk); return ret ?: chunk; } int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sk->sk_socket, .msg = msg, .size = size, .flags = flags }; return unix_stream_read_generic(&state, true); } static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sock, .msg = msg, .size = size, .flags = flags }; #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_stream_proto) return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, NULL); #endif return unix_stream_read_generic(&state, true); } static int unix_stream_splice_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { return skb_splice_bits(skb, state->socket->sk, UNIXCB(skb).consumed + skip, state->pipe, chunk, state->splice_flags); } static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t size, unsigned int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_splice_actor, .socket = sock, .pipe = pipe, .size = size, .splice_flags = flags, }; if (unlikely(*ppos)) return -ESPIPE; if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) state.flags = MSG_DONTWAIT; return unix_stream_read_generic(&state, false); } static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; struct sock *other; if (mode < SHUT_RD || mode > SHUT_RDWR) return -EINVAL; /* This maps: * SHUT_RD (0) -> RCV_SHUTDOWN (1) * SHUT_WR (1) -> SEND_SHUTDOWN (2) * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) */ ++mode; unix_state_lock(sk); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); other = unix_peer(sk); if (other) sock_hold(other); unix_state_unlock(sk); sk->sk_state_change(sk); if (other && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; const struct proto *prot = READ_ONCE(other->sk_prot); if (prot->unhash) prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; unix_state_lock(other); WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); unix_state_unlock(other); other->sk_state_change(other); if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); } if (other) sock_put(other); return 0; } long unix_inq_len(struct sock *sk) { struct sk_buff *skb; long amount = 0; if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; spin_lock(&sk->sk_receive_queue.lock); if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; } spin_unlock(&sk->sk_receive_queue.lock); return amount; } EXPORT_SYMBOL_GPL(unix_inq_len); long unix_outq_len(struct sock *sk) { return sk_wmem_alloc_get(sk); } EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { struct path path; struct file *f; int fd; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; path = unix_sk(sk)->path; if (!path.dentry) return -ENOENT; path_get(&path); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) goto out; f = dentry_open(&path, O_PATH, current_cred()); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); goto out; } fd_install(fd, f); out: path_put(&path); return fd; } static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; long amount = 0; int err; switch (cmd) { case SIOCOUTQ: amount = unix_outq_len(sk); err = put_user(amount, (int __user *)arg); break; case SIOCINQ: amount = unix_inq_len(sk); if (amount < 0) err = amount; else err = put_user(amount, (int __user *)arg); break; case SIOCUNIXFILE: err = unix_open_file(sk); break; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) case SIOCATMARK: { struct sk_buff *skb; int answ = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) answ = 1; err = put_user(answ, (int __user *)arg); } break; #endif default: err = -ENOIOCTLCMD; break; } return err; } #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } #endif static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (READ_ONCE(unix_sk(sk)->oob_skb)) mask |= EPOLLPRI; #endif /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && state == TCP_CLOSE) mask |= EPOLLHUP; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (unix_writable(sk, state)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk, *other; unsigned int writable; unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) mask |= EPOLLHUP; /* No write status requested, avoid expensive OUT tests. */ if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk, state); if (writable) { unix_state_lock(sk); other = unix_peer(sk); if (other && unix_peer(other) != sk && unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; unix_state_unlock(sk); } if (writable) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } #ifdef CONFIG_PROC_FS #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); unsigned long bucket = get_bucket(*pos); struct sock *sk; unsigned long count = 0; for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) { if (sock_net(sk) != seq_file_net(seq)) continue; if (++count == offset) break; } return sk; } static struct sock *unix_next_socket(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket; while (sk > (struct sock *)SEQ_START_TOKEN) { sk = sk_next(sk); if (!sk) goto next_bucket; if (sock_net(sk) == seq_file_net(seq)) return sk; } do { sk = unix_from_bucket(seq, pos); if (sk) return sk; next_bucket: bucket = get_bucket(*pos) + 1; *pos = set_bucket_offset(bucket, 1); } while (bucket < ARRAY_SIZE(unix_socket_table)); return NULL; } static void *unix_seq_start(struct seq_file *seq, loff_t *pos) __acquires(unix_table_lock) { spin_lock(&unix_table_lock); if (!*pos) return SEQ_START_TOKEN; if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table)) return NULL; return unix_next_socket(seq, NULL, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return unix_next_socket(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) __releases(unix_table_lock) { spin_unlock(&unix_table_lock); } static int unix_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Num RefCount Protocol Flags Type St " "Inode Path\n"); else { struct sock *s = v; struct unix_sock *u = unix_sk(s); unix_state_lock(s); seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", s, refcount_read(&s->sk_refcnt), 0, s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, s->sk_type, s->sk_socket ? (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); if (u->addr) { // under unix_table_lock here int i, len; seq_putc(seq, ' '); i = 0; len = u->addr->len - sizeof(short); if (!UNIX_ABSTRACT(s)) len--; else { seq_putc(seq, '@'); i++; } for ( ; i < len; i++) seq_putc(seq, u->addr->name->sun_path[i] ?: '@'); } unix_state_unlock(s); seq_putc(seq, '\n'); } return 0; } static const struct seq_operations unix_seq_ops = { .start = unix_seq_start, .next = unix_seq_next, .stop = unix_seq_stop, .show = unix_seq_show, }; #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__unix { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct unix_sock *, unix_sk); uid_t uid __aligned(8); }; static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) { struct bpf_iter__unix ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.unix_sk = unix_sk; ctx.uid = uid; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; if (v == SEQ_START_TOKEN) return 0; uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); return unix_prog_seq_show(prog, &meta, v, uid); } static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)unix_prog_seq_show(prog, &meta, v, 0); } unix_seq_stop(seq, v); } static const struct seq_operations bpf_iter_unix_seq_ops = { .start = unix_seq_start, .next = unix_seq_next, .stop = bpf_iter_unix_seq_stop, .show = bpf_iter_unix_seq_show, }; #endif #endif static const struct net_proto_family unix_family_ops = { .family = PF_UNIX, .create = unix_create, .owner = THIS_MODULE, }; static int __net_init unix_net_init(struct net *net) { int error = -ENOMEM; net->unx.sysctl_max_dgram_qlen = 10; if (unix_sysctl_register(net)) goto out; #ifdef CONFIG_PROC_FS if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, sizeof(struct seq_net_private))) { unix_sysctl_unregister(net); goto out; } #endif error = 0; out: return error; } static void __net_exit unix_net_exit(struct net *net) { unix_sysctl_unregister(net); remove_proc_entry("unix", net->proc_net); } static struct pernet_operations unix_net_ops = { .init = unix_net_init, .exit = unix_net_exit, }; #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) static const struct bpf_iter_seq_info unix_seq_info = { .seq_ops = &bpf_iter_unix_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct seq_net_private), }; static struct bpf_iter_reg unix_reg_info = { .target = "unix", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__unix, unix_sk), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &unix_seq_info, }; static void __init bpf_iter_register(void) { unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; if (bpf_iter_reg_target(&unix_reg_info)) pr_warn("Warning: could not register bpf iterator unix\n"); } #endif static int __init af_unix_init(void) { int rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); rc = proto_register(&unix_dgram_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; } rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); proto_unregister(&unix_dgram_proto); goto out; } sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); unix_bpf_build_proto(); #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif out: return rc; } static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); proto_unregister(&unix_dgram_proto); proto_unregister(&unix_stream_proto); unregister_pernet_subsys(&unix_net_ops); } /* Earlier than device_initcall() so that other drivers invoking request_module() don't end up in a loop when modprobe tries to use a UNIX socket. But later than subsys_initcall() because we depend on stuff initialised there */ fs_initcall(af_unix_init); module_exit(af_unix_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_UNIX); |
8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 | // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/async.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/dirent.h> #include <linux/syscalls.h> #include <linux/utime.h> #include <linux/file.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/namei.h> #include <linux/init_syscalls.h> #include <linux/umh.h> static ssize_t __init xwrite(struct file *file, const char *p, size_t count, loff_t *pos) { ssize_t out = 0; /* sys_write only can write MAX_RW_COUNT aka 2G-4K bytes at most */ while (count) { ssize_t rv = kernel_write(file, p, count, pos); if (rv < 0) { if (rv == -EINTR || rv == -EAGAIN) continue; return out ? out : rv; } else if (rv == 0) break; p += rv; out += rv; count -= rv; } return out; } static __initdata char *message; static void __init error(char *x) { if (!message) message = x; } static void panic_show_mem(const char *fmt, ...) { va_list args; show_mem(0, NULL); va_start(args, fmt); panic(fmt, args); va_end(args); } /* link hash */ #define N_ALIGN(len) ((((len) + 1) & ~3) + 2) static __initdata struct hash { int ino, minor, major; umode_t mode; struct hash *next; char name[N_ALIGN(PATH_MAX)]; } *head[32]; static inline int hash(int major, int minor, int ino) { unsigned long tmp = ino + minor + (major << 3); tmp += tmp >> 5; return tmp & 31; } static char __init *find_link(int major, int minor, int ino, umode_t mode, char *name) { struct hash **p, *q; for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { if ((*p)->ino != ino) continue; if ((*p)->minor != minor) continue; if ((*p)->major != major) continue; if (((*p)->mode ^ mode) & S_IFMT) continue; return (*p)->name; } q = kmalloc(sizeof(struct hash), GFP_KERNEL); if (!q) panic_show_mem("can't allocate link hash entry"); q->major = major; q->minor = minor; q->ino = ino; q->mode = mode; strcpy(q->name, name); q->next = NULL; *p = q; return NULL; } static void __init free_hash(void) { struct hash **p, *q; for (p = head; p < head + 32; p++) { while (*p) { q = *p; *p = q->next; kfree(q); } } } static long __init do_utime(char *filename, time64_t mtime) { struct timespec64 t[2]; t[0].tv_sec = mtime; t[0].tv_nsec = 0; t[1].tv_sec = mtime; t[1].tv_nsec = 0; return init_utimes(filename, t); } static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; char *name; time64_t mtime; }; static void __init dir_add(const char *name, time64_t mtime) { struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); de->name = kstrdup(name, GFP_KERNEL); de->mtime = mtime; list_add(&de->list, &dir_list); } static void __init dir_utime(void) { struct dir_entry *de, *tmp; list_for_each_entry_safe(de, tmp, &dir_list, list) { list_del(&de->list); do_utime(de->name, de->mtime); kfree(de->name); kfree(de); } } static __initdata time64_t mtime; /* cpio header parsing */ static __initdata unsigned long ino, major, minor, nlink; static __initdata umode_t mode; static __initdata unsigned long body_len, name_len; static __initdata uid_t uid; static __initdata gid_t gid; static __initdata unsigned rdev; static void __init parse_header(char *s) { unsigned long parsed[12]; char buf[9]; int i; buf[8] = '\0'; for (i = 0, s += 6; i < 12; i++, s += 8) { memcpy(buf, s, 8); parsed[i] = simple_strtoul(buf, NULL, 16); } ino = parsed[0]; mode = parsed[1]; uid = parsed[2]; gid = parsed[3]; nlink = parsed[4]; mtime = parsed[5]; /* breaks in y2106 */ body_len = parsed[6]; major = parsed[7]; minor = parsed[8]; rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); name_len = parsed[11]; } /* FSM */ static __initdata enum state { Start, Collect, GotHeader, SkipIt, GotName, CopyFile, GotSymlink, Reset } state, next_state; static __initdata char *victim; static unsigned long byte_count __initdata; static __initdata loff_t this_header, next_header; static inline void __init eat(unsigned n) { victim += n; this_header += n; byte_count -= n; } static __initdata char *collected; static long remains __initdata; static __initdata char *collect; static void __init read_into(char *buf, unsigned size, enum state next) { if (byte_count >= size) { collected = victim; eat(size); state = next; } else { collect = collected = buf; remains = size; next_state = next; state = Collect; } } static __initdata char *header_buf, *symlink_buf, *name_buf; static int __init do_start(void) { read_into(header_buf, 110, GotHeader); return 0; } static int __init do_collect(void) { unsigned long n = remains; if (byte_count < n) n = byte_count; memcpy(collect, victim, n); eat(n); collect += n; if ((remains -= n) != 0) return 1; state = next_state; return 0; } static int __init do_header(void) { if (memcmp(collected, "070707", 6)==0) { error("incorrect cpio method used: use -H newc option"); return 1; } if (memcmp(collected, "070701", 6)) { error("no cpio magic"); return 1; } parse_header(collected); next_header = this_header + N_ALIGN(name_len) + body_len; next_header = (next_header + 3) & ~3; state = SkipIt; if (name_len <= 0 || name_len > PATH_MAX) return 0; if (S_ISLNK(mode)) { if (body_len > PATH_MAX) return 0; collect = collected = symlink_buf; remains = N_ALIGN(name_len) + body_len; next_state = GotSymlink; state = Collect; return 0; } if (S_ISREG(mode) || !body_len) read_into(name_buf, N_ALIGN(name_len), GotName); return 0; } static int __init do_skip(void) { if (this_header + byte_count < next_header) { eat(byte_count); return 1; } else { eat(next_header - this_header); state = next_state; return 0; } } static int __init do_reset(void) { while (byte_count && *victim == '\0') eat(1); if (byte_count && (this_header & 3)) error("broken padding"); return 1; } static void __init clean_path(char *path, umode_t fmode) { struct kstat st; if (!init_stat(path, &st, AT_SYMLINK_NOFOLLOW) && (st.mode ^ fmode) & S_IFMT) { if (S_ISDIR(st.mode)) init_rmdir(path); else init_unlink(path); } } static int __init maybe_link(void) { if (nlink >= 2) { char *old = find_link(major, minor, ino, mode, collected); if (old) { clean_path(collected, 0); return (init_link(old, collected) < 0) ? -1 : 1; } } return 0; } static __initdata struct file *wfile; static __initdata loff_t wfile_pos; static int __init do_name(void) { state = SkipIt; next_state = Reset; /* name_len > 0 && name_len <= PATH_MAX checked in do_header */ if (collected[name_len - 1] != '\0') { pr_err("initramfs name without nulterm: %.*s\n", (int)name_len, collected); error("malformed archive"); return 1; } if (strcmp(collected, "TRAILER!!!") == 0) { free_hash(); return 0; } clean_path(collected, mode); if (S_ISREG(mode)) { int ml = maybe_link(); if (ml >= 0) { int openflags = O_WRONLY|O_CREAT; if (ml != 1) openflags |= O_TRUNC; wfile = filp_open(collected, openflags, mode); if (IS_ERR(wfile)) return 0; wfile_pos = 0; vfs_fchown(wfile, uid, gid); vfs_fchmod(wfile, mode); if (body_len) vfs_truncate(&wfile->f_path, body_len); state = CopyFile; } } else if (S_ISDIR(mode)) { init_mkdir(collected, mode); init_chown(collected, uid, gid, 0); init_chmod(collected, mode); dir_add(collected, mtime); } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { if (maybe_link() == 0) { init_mknod(collected, mode, rdev); init_chown(collected, uid, gid, 0); init_chmod(collected, mode); do_utime(collected, mtime); } } return 0; } static int __init do_copy(void) { if (byte_count >= body_len) { struct timespec64 t[2] = { }; if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) error("write error"); t[0].tv_sec = mtime; t[1].tv_sec = mtime; vfs_utimes(&wfile->f_path, t); fput(wfile); eat(body_len); state = SkipIt; return 0; } else { if (xwrite(wfile, victim, byte_count, &wfile_pos) != byte_count) error("write error"); body_len -= byte_count; eat(byte_count); return 1; } } static int __init do_symlink(void) { if (collected[name_len - 1] != '\0') { pr_err("initramfs symlink without nulterm: %.*s\n", (int)name_len, collected); error("malformed archive"); return 1; } collected[N_ALIGN(name_len) + body_len] = '\0'; clean_path(collected, 0); init_symlink(collected + N_ALIGN(name_len), collected); init_chown(collected, uid, gid, AT_SYMLINK_NOFOLLOW); do_utime(collected, mtime); state = SkipIt; next_state = Reset; return 0; } static __initdata int (*actions[])(void) = { [Start] = do_start, [Collect] = do_collect, [GotHeader] = do_header, [SkipIt] = do_skip, [GotName] = do_name, [CopyFile] = do_copy, [GotSymlink] = do_symlink, [Reset] = do_reset, }; static long __init write_buffer(char *buf, unsigned long len) { byte_count = len; victim = buf; while (!actions[state]()) ; return len - byte_count; } static long __init flush_buffer(void *bufv, unsigned long len) { char *buf = (char *) bufv; long written; long origLen = len; if (message) return -1; while ((written = write_buffer(buf, len)) < len && !message) { char c = buf[written]; if (c == '0') { buf += written; len -= written; state = Start; } else if (c == 0) { buf += written; len -= written; state = Reset; } else error("junk within compressed archive"); } return origLen; } static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ #include <linux/decompress/generic.h> static char * __init unpack_to_rootfs(char *buf, unsigned long len) { long written; decompress_fn decompress; const char *compress_name; static __initdata char msg_buf[64]; header_buf = kmalloc(110, GFP_KERNEL); symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); if (!header_buf || !symlink_buf || !name_buf) panic_show_mem("can't allocate buffers"); state = Start; this_header = 0; message = NULL; while (!message && len) { loff_t saved_offset = this_header; if (*buf == '0' && !(this_header & 3)) { state = Start; written = write_buffer(buf, len); buf += written; len -= written; continue; } if (!*buf) { buf++; len--; this_header++; continue; } this_header = 0; decompress = decompress_method(buf, len, &compress_name); pr_debug("Detected %s compressed data\n", compress_name); if (decompress) { int res = decompress(buf, len, NULL, flush_buffer, NULL, &my_inptr, error); if (res) error("decompressor failed"); } else if (compress_name) { if (!message) { snprintf(msg_buf, sizeof msg_buf, "compression method %s not configured", compress_name); message = msg_buf; } } else error("invalid magic at start of compressed archive"); if (state != Reset) error("junk at the end of compressed archive"); this_header = saved_offset + my_inptr; buf += my_inptr; len -= my_inptr; } dir_utime(); kfree(name_buf); kfree(symlink_buf); kfree(header_buf); return message; } static int __initdata do_retain_initrd; static int __init retain_initrd_param(char *str) { if (*str) return 0; do_retain_initrd = 1; return 1; } __setup("retain_initrd", retain_initrd_param); #ifdef CONFIG_ARCH_HAS_KEEPINITRD static int __init keepinitrd_setup(char *__unused) { do_retain_initrd = 1; return 1; } __setup("keepinitrd", keepinitrd_setup); #endif static bool __initdata initramfs_async = true; static int __init initramfs_async_setup(char *str) { strtobool(str, &initramfs_async); return 1; } __setup("initramfs_async=", initramfs_async_setup); extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include <linux/initrd.h> #include <linux/kexec.h> void __init reserve_initrd_mem(void) { phys_addr_t start; unsigned long size; /* Ignore the virtul address computed during device tree parsing */ initrd_start = initrd_end = 0; if (!phys_initrd_size) return; /* * Round the memory region to page boundaries as per free_initrd_mem() * This allows us to detect whether the pages overlapping the initrd * are in use, but more importantly, reserves the entire set of pages * as we don't want these pages allocated for other purposes. */ start = round_down(phys_initrd_start, PAGE_SIZE); size = phys_initrd_size + (phys_initrd_start - start); size = round_up(size, PAGE_SIZE); if (!memblock_is_region_memory(start, size)) { pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region", (u64)start, size); goto disable; } if (memblock_is_region_reserved(start, size)) { pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n", (u64)start, size); goto disable; } memblock_reserve(start, size); /* Now convert initrd to virtual addresses */ initrd_start = (unsigned long)__va(phys_initrd_start); initrd_end = initrd_start + phys_initrd_size; initrd_below_start_ok = 1; return; disable: pr_cont(" - disabling initrd\n"); initrd_start = 0; initrd_end = 0; } void __weak __init free_initrd_mem(unsigned long start, unsigned long end) { #ifdef CONFIG_ARCH_KEEP_MEMBLOCK unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE); unsigned long aligned_end = ALIGN(end, PAGE_SIZE); memblock_free(__pa(aligned_start), aligned_end - aligned_start); #endif free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, "initrd"); } #ifdef CONFIG_KEXEC_CORE static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); unsigned long crashk_end = (unsigned long)__va(crashk_res.end); /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ if (initrd_start >= crashk_end || initrd_end <= crashk_start) return false; /* * Initialize initrd memory region since the kexec boot does not do. */ memset((void *)initrd_start, 0, initrd_end - initrd_start); if (initrd_start < crashk_start) free_initrd_mem(initrd_start, crashk_start); if (initrd_end > crashk_end) free_initrd_mem(crashk_end, initrd_end); return true; } #else static inline bool kexec_free_initrd(void) { return false; } #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_BLK_DEV_RAM static void __init populate_initrd_image(char *err) { ssize_t written; struct file *file; loff_t pos = 0; unpack_to_rootfs(__initramfs_start, __initramfs_size); printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); file = filp_open("/initrd.image", O_WRONLY|O_CREAT|O_LARGEFILE, 0700); if (IS_ERR(file)) return; written = xwrite(file, (char *)initrd_start, initrd_end - initrd_start, &pos); if (written != initrd_end - initrd_start) pr_err("/initrd.image: incomplete write (%zd != %ld)\n", written, initrd_end - initrd_start); fput(file); } #endif /* CONFIG_BLK_DEV_RAM */ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) { /* Load the built in initramfs */ char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) goto done; if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); else printk(KERN_INFO "Unpacking initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); if (err) { #ifdef CONFIG_BLK_DEV_RAM populate_initrd_image(err); #else printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); #endif } done: /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) free_initrd_mem(initrd_start, initrd_end); initrd_start = 0; initrd_end = 0; flush_delayed_fput(); } static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain); static async_cookie_t initramfs_cookie; void wait_for_initramfs(void) { if (!initramfs_cookie) { /* * Something before rootfs_initcall wants to access * the filesystem/initramfs. Probably a bug. Make a * note, avoid deadlocking the machine, and let the * caller's access fail as it used to. */ pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n"); return; } async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain); } EXPORT_SYMBOL_GPL(wait_for_initramfs); static int __init populate_rootfs(void) { initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL, &initramfs_domain); usermodehelper_enable(); if (!initramfs_async) wait_for_initramfs(); return 0; } rootfs_initcall(populate_rootfs); |
4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 | // SPDX-License-Identifier: GPL-2.0 /* * blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blk-mq.h> #include <linux/list_sort.h> #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-wbt.h" void blk_mq_sched_assign_ioc(struct request *rq) { struct request_queue *q = rq->q; struct io_context *ioc; struct io_cq *icq; /* * May not have an IO context if it's a passthrough request */ ioc = current->io_context; if (!ioc) return; spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(ioc, q); spin_unlock_irq(&q->queue_lock); if (!icq) { icq = ioc_create_icq(ioc, q, GFP_ATOMIC); if (!icq) return; } get_io_context(icq->ioc); rq->elv.icq = icq; } /* * Mark a hardware queue as needing a restart. */ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch) * in blk_mq_run_hw_queue(). Its pair is the barrier in * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART, * meantime new request added to hctx->dispatch is missed to check in * blk_mq_run_hw_queue(). */ smp_mb(); blk_mq_run_hw_queue(hctx, true); } static int sched_rq_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); return rqa->mq_hctx > rqb->mq_hctx; } static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) { struct blk_mq_hw_ctx *hctx = list_first_entry(rq_list, struct request, queuelist)->mq_hctx; struct request *rq; LIST_HEAD(hctx_list); unsigned int count = 0; list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { list_cut_before(&hctx_list, rq_list, &rq->queuelist); goto dispatch; } count++; } list_splice_tail_init(rq_list, &hctx_list); dispatch: return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; bool multi_hctxs = false, run_queue = false; bool dispatched = false, busy = false; unsigned int max_dispatch; LIST_HEAD(rq_list); int count = 0; if (hctx->dispatch_busy) max_dispatch = 1; else max_dispatch = hctx->queue->nr_requests; do { struct request *rq; int budget_token; if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!list_empty_careful(&hctx->dispatch)) { busy = true; break; } budget_token = blk_mq_get_dispatch_budget(q); if (budget_token < 0) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the * same queue and if we didn't dispatch then there's * no guarantee anyone will kick the queue. Kick it * ourselves. */ run_queue = true; break; } blk_mq_set_rq_budget_token(rq, budget_token); /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add_tail(&rq->queuelist, &rq_list); count++; if (rq->mq_hctx != hctx) multi_hctxs = true; /* * If we cannot get tag for the request, stop dequeueing * requests from the IO scheduler. We are unlikely to be able * to submit them anyway and it creates false impression for * scheduling heuristics that the device can take more IO. */ if (!blk_mq_get_driver_tag(rq)) break; } while (count < max_dispatch); if (!count) { if (run_queue) blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); } else if (multi_hctxs) { /* * Requests from different hctx may be dequeued from some * schedulers, such as bfq and deadline. * * Sort the requests in the list according to their hctx, * dispatch batching requests from same hctx at a time. */ list_sort(NULL, &rq_list, sched_rq_cmp); do { dispatched |= blk_mq_dispatch_hctx_list(&rq_list); } while (!list_empty(&rq_list)); } else { dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); } if (busy) return -EAGAIN; return !!dispatched; } static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { unsigned long end = jiffies + HZ; int ret; do { ret = __blk_mq_do_dispatch_sched(hctx); if (ret != 1) break; if (need_resched() || time_is_before_jiffies(end)) { blk_mq_delay_run_hw_queue(hctx, 0); break; } } while (1); return ret; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { unsigned short idx = ctx->index_hw[hctx->type]; if (++idx == hctx->nr_ctx) idx = 0; return hctx->ctxs[idx]; } /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); int ret = 0; struct request *rq; do { int budget_token; if (!list_empty_careful(&hctx->dispatch)) { ret = -EAGAIN; break; } if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; budget_token = blk_mq_get_dispatch_budget(q); if (budget_token < 0) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the * same queue and if we didn't dispatch then there's * no guarantee anyone will kick the queue. Kick it * ourselves. */ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); break; } blk_mq_set_rq_budget_token(rq, budget_token); /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add(&rq->queuelist, &rq_list); /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; } static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; const bool has_sched = q->elevator; int ret = 0; LIST_HEAD(rq_list); /* * If we have previous entries on our dispatch list, grab them first for * more fair dispatch. */ if (!list_empty_careful(&hctx->dispatch)) { spin_lock(&hctx->lock); if (!list_empty(&hctx->dispatch)) list_splice_init(&hctx->dispatch, &rq_list); spin_unlock(&hctx->lock); } /* * Only ask the scheduler for requests, if we didn't have residual * requests from the dispatch list. This is to avoid the case where * we only ever dispatch a fraction of the requests available because * of low device queue depth. Once we pull requests out of the IO * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. * * We want to dispatch from the scheduler if there was nothing * on the dispatch list or we were able to dispatch from the * dispatch list. */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { if (has_sched) ret = blk_mq_do_dispatch_sched(hctx); else ret = blk_mq_do_dispatch_ctx(hctx); } } else if (has_sched) { ret = blk_mq_do_dispatch_sched(hctx); } else if (hctx->dispatch_busy) { /* dequeue request one by one from sw queue if queue is busy */ ret = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(hctx, &rq_list, 0); } return ret; } void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; hctx->run++; /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. */ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) { if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) blk_mq_run_hw_queue(hctx, true); } } bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; bool ret = false; enum hctx_type type; if (e && e->type->ops.bio_merge) return e->type->ops.bio_merge(q, bio, nr_segs); ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || list_empty_careful(&ctx->rq_lists[type])) return false; /* default per sw-queue merge */ spin_lock(&ctx->lock); /* * Reverse check our software queue for entries that we could * potentially merge with. Currently includes a hand-wavy stop * count of 8, to not spend too much time checking for merges. */ if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { ctx->rq_merged++; ret = true; } spin_unlock(&ctx->lock); return ret; } bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free) { return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq) { /* * dispatch flush and passthrough rq directly * * passthrough request has to be added to hctx->dispatch directly. * For some reason, device may be in one situation which can't * handle FS request, so STS_RESOURCE is always returned and the * FS request will be added to hctx->dispatch. However passthrough * request may be required at that time for fixing the problem. If * passthrough request is added to scheduler queue, there isn't any * chance to dispatch it given we prioritize requests in hctx->dispatch. */ if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq)) return true; return false; } void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG)); if (blk_mq_sched_bypass_insert(hctx, rq)) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( * hctx->dispatch) directly and there is at most one in-flight * flush request for each hw queue, so it doesn't matter to add * flush request to tail or front of the dispatch queue. * * Secondly in case of NCQ, flush request belongs to non-NCQ * command, and queueing it will fail when there is any * in-flight normal IO request(NCQ command). When adding flush * rq to the front of hctx->dispatch, it is easier to introduce * extra time to flush rq's latency because of S_SCHED_RESTART * compared with adding to the tail of dispatch queue, then * chance of flush merge is increased, and less flush requests * will be issued to controller. It is observed that ~10% time * is saved in blktests block/004 on disk attached to AHCI/NCQ * drive when adding flush rq to the front of hctx->dispatch. * * Simply queue flush rq to the front of hctx->dispatch so that * intensive flush workloads can benefit in case of NCQ HW. */ at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head; blk_mq_request_bypass_insert(rq, at_head, false); goto run; } if (e) { LIST_HEAD(list); list_add(&rq->queuelist, &list); e->type->ops.insert_requests(hctx, &list, at_head); } else { spin_lock(&ctx->lock); __blk_mq_insert_request(hctx, rq, at_head); spin_unlock(&ctx->lock); } run: if (run_queue) blk_mq_run_hw_queue(hctx, async); } void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { struct elevator_queue *e; struct request_queue *q = hctx->queue; /* * blk_mq_sched_insert_requests() is called from flush plug * context only, and hold one usage counter to prevent queue * from being released. */ percpu_ref_get(&q->q_usage_counter); e = hctx->queue->elevator; if (e) { e->type->ops.insert_requests(hctx, list, false); } else { /* * try to issue requests directly if the hw queue isn't * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. */ if (!hctx->dispatch_busy && !e && !run_queue_async) { blk_mq_try_issue_list_directly(hctx, list); if (list_empty(list)) goto out; } blk_mq_insert_requests(hctx, ctx, list); } blk_mq_run_hw_queue(hctx, run_queue_async); out: percpu_ref_put(&q->q_usage_counter); } static int blk_mq_sched_alloc_tags(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; int ret; hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, set->reserved_tags, set->flags); if (!hctx->sched_tags) return -ENOMEM; ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); if (ret) { blk_mq_free_rq_map(hctx->sched_tags, set->flags); hctx->sched_tags = NULL; } return ret; } /* called in queue's release handler, tagset has gone away */ static void blk_mq_sched_tags_teardown(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); hctx->sched_tags = NULL; } } } static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) { struct blk_mq_tag_set *set = queue->tag_set; int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); struct blk_mq_hw_ctx *hctx; int ret, i; /* * Set initial depth at max so that we don't need to reallocate for * updating nr_requests. */ ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, &queue->sched_breserved_tags, MAX_SCHED_RQ, set->reserved_tags, set->numa_node, alloc_policy); if (ret) return ret; queue_for_each_hw_ctx(queue, hctx, i) { hctx->sched_tags->bitmap_tags = &queue->sched_bitmap_tags; hctx->sched_tags->breserved_tags = &queue->sched_breserved_tags; } sbitmap_queue_resize(&queue->sched_bitmap_tags, queue->nr_requests - set->reserved_tags); return 0; } static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) { sbitmap_queue_free(&queue->sched_bitmap_tags); sbitmap_queue_free(&queue->sched_breserved_tags); } int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; unsigned int i; int ret; if (!e) { q->elevator = NULL; q->nr_requests = q->tag_set->queue_depth; return 0; } /* * Default to double of smaller one between hw queue_depth and 128, * since we don't split into sync/async like the old code did. * Additionally, this is a per-hw queue depth. */ q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, BLKDEV_MAX_RQ); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_tags(q, hctx, i); if (ret) goto err_free_tags; } if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { ret = blk_mq_init_sched_shared_sbitmap(q); if (ret) goto err_free_tags; } ret = e->ops.init_sched(q, e); if (ret) goto err_free_sbitmap; blk_mq_debugfs_register_sched(q); queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; blk_mq_sched_free_requests(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; } } blk_mq_debugfs_register_sched_hctx(q, hctx); } return 0; err_free_sbitmap: if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) blk_mq_exit_sched_shared_sbitmap(q); err_free_tags: blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; return ret; } /* * called in either blk_queue_cleanup or elevator_switch, tagset * is required for freeing requests */ void blk_mq_sched_free_requests(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); } } void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned int i; unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { blk_mq_debugfs_unregister_sched_hctx(hctx); if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } flags = hctx->flags; } blk_mq_debugfs_unregister_sched(q); if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); if (blk_mq_is_sbitmap_shared(flags)) blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; } |
9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netclassid_cgroup.c Classid Cgroupfs Handling * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> #include <linux/sched/task.h> #include <net/cls_cgroup.h> #include <net/sock.h> static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cgroup_cls_state, css) : NULL; } struct cgroup_cls_state *task_cls_state(struct task_struct *p) { return css_cls_state(task_css_check(p, net_cls_cgrp_id, rcu_read_lock_bh_held())); } EXPORT_SYMBOL_GPL(task_cls_state); static struct cgroup_subsys_state * cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); return &cs->css; } static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_cls_state *cs = css_cls_state(css); struct cgroup_cls_state *parent = css_cls_state(css->parent); if (parent) cs->classid = parent->classid; return 0; } static void cgrp_css_free(struct cgroup_subsys_state *css) { kfree(css_cls_state(css)); } /* * To avoid freezing of sockets creation for tasks with big number of threads * and opened sockets lets release file_lock every 1000 iterated descriptors. * New sockets will already have been created with new classid. */ struct update_classid_context { u32 classid; unsigned int batch; }; #define UPDATE_CLASSID_BATCH 1000 static int update_classid_sock(const void *v, struct file *file, unsigned n) { struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file); if (sock) sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); if (--ctx->batch == 0) { ctx->batch = UPDATE_CLASSID_BATCH; return n + 1; } return 0; } static void update_classid_task(struct task_struct *p, u32 classid) { struct update_classid_context ctx = { .classid = classid, .batch = UPDATE_CLASSID_BATCH }; unsigned int fd = 0; do { task_lock(p); fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); task_unlock(p); cond_resched(); } while (fd); } static void cgrp_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct task_struct *p; cgroup_taskset_for_each(p, css, tset) { update_classid_task(p, css_cls_state(css)->classid); } } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) { return css_cls_state(css)->classid; } static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { struct cgroup_cls_state *cs = css_cls_state(css); struct css_task_iter it; struct task_struct *p; cs->classid = (u32)value; css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) update_classid_task(p, cs->classid); css_task_iter_end(&it); return 0; } static struct cftype ss_files[] = { { .name = "classid", .read_u64 = read_classid, .write_u64 = write_classid, }, { } /* terminate */ }; struct cgroup_subsys net_cls_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = cgrp_attach, .legacy_cftypes = ss_files, }; |
7 7 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Point-to-Point Tunneling Protocol for Linux * * Authors: Dmitry Kozlov <xeb@mail.ru> */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/if_pppox.h> #include <linux/ppp-ioctl.h> #include <linux/notifier.h> #include <linux/file.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/rcupdate.h> #include <linux/security.h> #include <linux/spinlock.h> #include <net/sock.h> #include <net/protocol.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/gre.h> #include <net/pptp.h> #include <linux/uaccess.h> #define PPTP_DRIVER_VERSION "0.8.5" #define MAX_CALLID 65535 static DECLARE_BITMAP(callid_bitmap, MAX_CALLID + 1); static struct pppox_sock __rcu **callid_sock; static DEFINE_SPINLOCK(chan_lock); static struct proto pptp_sk_proto __read_mostly; static const struct ppp_channel_ops pptp_chan_ops; static const struct proto_ops pptp_ops; static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr) { struct pppox_sock *sock; struct pptp_opt *opt; rcu_read_lock(); sock = rcu_dereference(callid_sock[call_id]); if (sock) { opt = &sock->proto.pptp; if (opt->dst_addr.sin_addr.s_addr != s_addr) sock = NULL; else sock_hold(sk_pppox(sock)); } rcu_read_unlock(); return sock; } static int lookup_chan_dst(u16 call_id, __be32 d_addr) { struct pppox_sock *sock; struct pptp_opt *opt; int i; rcu_read_lock(); i = 1; for_each_set_bit_from(i, callid_bitmap, MAX_CALLID) { sock = rcu_dereference(callid_sock[i]); if (!sock) continue; opt = &sock->proto.pptp; if (opt->dst_addr.call_id == call_id && opt->dst_addr.sin_addr.s_addr == d_addr) break; } rcu_read_unlock(); return i < MAX_CALLID; } static int add_chan(struct pppox_sock *sock, struct pptp_addr *sa) { static int call_id; spin_lock(&chan_lock); if (!sa->call_id) { call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, call_id + 1); if (call_id == MAX_CALLID) { call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1); if (call_id == MAX_CALLID) goto out_err; } sa->call_id = call_id; } else if (test_bit(sa->call_id, callid_bitmap)) { goto out_err; } sock->proto.pptp.src_addr = *sa; set_bit(sa->call_id, callid_bitmap); rcu_assign_pointer(callid_sock[sa->call_id], sock); spin_unlock(&chan_lock); return 0; out_err: spin_unlock(&chan_lock); return -1; } static void del_chan(struct pppox_sock *sock) { spin_lock(&chan_lock); clear_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap); RCU_INIT_POINTER(callid_sock[sock->proto.pptp.src_addr.call_id], NULL); spin_unlock(&chan_lock); } static struct rtable *pptp_route_output(struct pppox_sock *po, struct flowi4 *fl4) { struct sock *sk = &po->sk; struct net *net; net = sock_net(sk); flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 0, RT_SCOPE_UNIVERSE, IPPROTO_GRE, 0, po->proto.pptp.dst_addr.sin_addr.s_addr, po->proto.pptp.src_addr.sin_addr.s_addr, 0, 0, sock_net_uid(net, sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = (struct sock *) chan->private; struct pppox_sock *po = pppox_sk(sk); struct net *net = sock_net(sk); struct pptp_opt *opt = &po->proto.pptp; struct pptp_gre_header *hdr; unsigned int header_len = sizeof(*hdr); struct flowi4 fl4; int islcp; int len; unsigned char *data; __u32 seq_recv; struct rtable *rt; struct net_device *tdev; struct iphdr *iph; int max_headroom; if (sk_pppox(po)->sk_state & PPPOX_DEAD) goto tx_error; rt = pptp_route_output(po, &fl4); if (IS_ERR(rt)) goto tx_error; tdev = rt->dst.dev; max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph) + sizeof(*hdr) + 2; if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); goto tx_error; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } data = skb->data; islcp = ((data[0] << 8) + data[1]) == PPP_LCP && 1 <= data[2] && data[2] <= 7; /* compress protocol field */ if ((opt->ppp_flags & SC_COMP_PROT) && data[0] == 0 && !islcp) skb_pull(skb, 1); /* Put in the address/control bytes if necessary */ if ((opt->ppp_flags & SC_COMP_AC) == 0 || islcp) { data = skb_push(skb, 2); data[0] = PPP_ALLSTATIONS; data[1] = PPP_UI; } len = skb->len; seq_recv = opt->seq_recv; if (opt->ack_sent == seq_recv) header_len -= sizeof(hdr->ack); /* Push down and install GRE header */ skb_push(skb, header_len); hdr = (struct pptp_gre_header *)(skb->data); hdr->gre_hd.flags = GRE_KEY | GRE_VERSION_1 | GRE_SEQ; hdr->gre_hd.protocol = GRE_PROTO_PPP; hdr->call_id = htons(opt->dst_addr.call_id); hdr->seq = htonl(++opt->seq_sent); if (opt->ack_sent != seq_recv) { /* send ack with this message */ hdr->gre_hd.flags |= GRE_ACK; hdr->ack = htonl(seq_recv); opt->ack_sent = seq_recv; } hdr->payload_len = htons(len); /* Push down and install the IP header. */ skb_reset_transport_header(skb); skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; if (ip_dont_fragment(sk, &rt->dst)) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->protocol = IPPROTO_GRE; iph->tos = 0; iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; iph->ttl = ip4_dst_hoplimit(&rt->dst); iph->tot_len = htons(skb->len); skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); nf_reset_ct(skb); skb->ip_summed = CHECKSUM_NONE; ip_select_ident(net, skb, NULL); ip_send_check(iph); ip_local_out(net, skb->sk, skb); return 1; tx_error: kfree_skb(skb); return 1; } static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; int headersize, payload_len, seq; __u8 *payload; struct pptp_gre_header *header; if (!(sk->sk_state & PPPOX_CONNECTED)) { if (sock_queue_rcv_skb(sk, skb)) goto drop; return NET_RX_SUCCESS; } header = (struct pptp_gre_header *)(skb->data); headersize = sizeof(*header); /* test if acknowledgement present */ if (GRE_IS_ACK(header->gre_hd.flags)) { __u32 ack; if (!pskb_may_pull(skb, headersize)) goto drop; header = (struct pptp_gre_header *)(skb->data); /* ack in different place if S = 0 */ ack = GRE_IS_SEQ(header->gre_hd.flags) ? ntohl(header->ack) : ntohl(header->seq); if (ack > opt->ack_recv) opt->ack_recv = ack; /* also handle sequence number wrap-around */ if (WRAPPED(ack, opt->ack_recv)) opt->ack_recv = ack; } else { headersize -= sizeof(header->ack); } /* test if payload present */ if (!GRE_IS_SEQ(header->gre_hd.flags)) goto drop; payload_len = ntohs(header->payload_len); seq = ntohl(header->seq); /* check for incomplete packet (length smaller than expected) */ if (!pskb_may_pull(skb, headersize + payload_len)) goto drop; payload = skb->data + headersize; /* check for expected sequence number */ if (seq < opt->seq_recv + 1 || WRAPPED(opt->seq_recv, seq)) { if ((payload[0] == PPP_ALLSTATIONS) && (payload[1] == PPP_UI) && (PPP_PROTOCOL(payload) == PPP_LCP) && ((payload[4] == PPP_LCP_ECHOREQ) || (payload[4] == PPP_LCP_ECHOREP))) goto allow_packet; } else { opt->seq_recv = seq; allow_packet: skb_pull(skb, headersize); if (payload[0] == PPP_ALLSTATIONS && payload[1] == PPP_UI) { /* chop off address/control */ if (skb->len < 3) goto drop; skb_pull(skb, 2); } skb->ip_summed = CHECKSUM_NONE; skb_set_network_header(skb, skb->head-skb->data); ppp_input(&po->chan, skb); return NET_RX_SUCCESS; } drop: kfree_skb(skb); return NET_RX_DROP; } static int pptp_rcv(struct sk_buff *skb) { struct pppox_sock *po; struct pptp_gre_header *header; struct iphdr *iph; if (skb->pkt_type != PACKET_HOST) goto drop; if (!pskb_may_pull(skb, 12)) goto drop; iph = ip_hdr(skb); header = (struct pptp_gre_header *)skb->data; if (header->gre_hd.protocol != GRE_PROTO_PPP || /* PPTP-GRE protocol for PPTP */ GRE_IS_CSUM(header->gre_hd.flags) || /* flag CSUM should be clear */ GRE_IS_ROUTING(header->gre_hd.flags) || /* flag ROUTING should be clear */ !GRE_IS_KEY(header->gre_hd.flags) || /* flag KEY should be set */ (header->gre_hd.flags & GRE_FLAGS)) /* flag Recursion Ctrl should be clear */ /* if invalid, discard this packet */ goto drop; po = lookup_chan(ntohs(header->call_id), iph->saddr); if (po) { skb_dst_drop(skb); nf_reset_ct(skb); return sk_receive_skb(sk_pppox(po), skb, 0); } drop: kfree_skb(skb); return NET_RX_DROP; } static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr; struct pppox_sock *po = pppox_sk(sk); int error = 0; if (sockaddr_len < sizeof(struct sockaddr_pppox)) return -EINVAL; lock_sock(sk); if (sk->sk_state & PPPOX_DEAD) { error = -EALREADY; goto out; } if (sk->sk_state & PPPOX_BOUND) { error = -EBUSY; goto out; } if (add_chan(po, &sp->sa_addr.pptp)) error = -EBUSY; else sk->sk_state |= PPPOX_BOUND; out: release_sock(sk); return error; } static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr; struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; struct rtable *rt; struct flowi4 fl4; int error = 0; if (sockaddr_len < sizeof(struct sockaddr_pppox)) return -EINVAL; if (sp->sa_protocol != PX_PROTO_PPTP) return -EINVAL; if (lookup_chan_dst(sp->sa_addr.pptp.call_id, sp->sa_addr.pptp.sin_addr.s_addr)) return -EALREADY; lock_sock(sk); /* Check for already bound sockets */ if (sk->sk_state & PPPOX_CONNECTED) { error = -EBUSY; goto end; } /* Check for already disconnected sockets, on attempts to disconnect */ if (sk->sk_state & PPPOX_DEAD) { error = -EALREADY; goto end; } if (!opt->src_addr.sin_addr.s_addr || !sp->sa_addr.pptp.sin_addr.s_addr) { error = -EINVAL; goto end; } po->chan.private = sk; po->chan.ops = &pptp_chan_ops; rt = pptp_route_output(po, &fl4); if (IS_ERR(rt)) { error = -EHOSTUNREACH; goto end; } sk_setup_caps(sk, &rt->dst); po->chan.mtu = dst_mtu(&rt->dst); if (!po->chan.mtu) po->chan.mtu = PPP_MRU; po->chan.mtu -= PPTP_HEADER_OVERHEAD; po->chan.hdrlen = 2 + sizeof(struct pptp_gre_header); error = ppp_register_channel(&po->chan); if (error) { pr_err("PPTP: failed to register PPP channel (%d)\n", error); goto end; } opt->dst_addr = sp->sa_addr.pptp; sk->sk_state |= PPPOX_CONNECTED; end: release_sock(sk); return error; } static int pptp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; memset(&sp.sa_addr, 0, sizeof(sp.sa_addr)); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_PPTP; sp.sa_addr.pptp = pppox_sk(sock->sk)->proto.pptp.src_addr; memcpy(uaddr, &sp, len); return len; } static int pptp_release(struct socket *sock) { struct sock *sk = sock->sk; struct pppox_sock *po; int error = 0; if (!sk) return 0; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { release_sock(sk); return -EBADF; } po = pppox_sk(sk); del_chan(po); synchronize_rcu(); pppox_unbind_sock(sk); sk->sk_state = PPPOX_DEAD; sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return error; } static void pptp_sock_destruct(struct sock *sk) { if (!(sk->sk_state & PPPOX_DEAD)) { del_chan(pppox_sk(sk)); pppox_unbind_sock(sk); } skb_queue_purge(&sk->sk_receive_queue); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); } static int pptp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; struct pppox_sock *po; struct pptp_opt *opt; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto, kern); if (!sk) goto out; sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = &pptp_ops; sk->sk_backlog_rcv = pptp_rcv_core; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; sk->sk_protocol = PX_PROTO_PPTP; sk->sk_destruct = pptp_sock_destruct; po = pppox_sk(sk); opt = &po->proto.pptp; opt->seq_sent = 0; opt->seq_recv = 0xffffffff; opt->ack_recv = 0; opt->ack_sent = 0xffffffff; error = 0; out: return error; } static int pptp_ppp_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) { struct sock *sk = (struct sock *) chan->private; struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; void __user *argp = (void __user *)arg; int __user *p = argp; int err, val; err = -EFAULT; switch (cmd) { case PPPIOCGFLAGS: val = opt->ppp_flags; if (put_user(val, p)) break; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, p)) break; opt->ppp_flags = val & ~SC_RCV_BITS; err = 0; break; default: err = -ENOTTY; } return err; } static const struct ppp_channel_ops pptp_chan_ops = { .start_xmit = pptp_xmit, .ioctl = pptp_ppp_ioctl, }; static struct proto pptp_sk_proto __read_mostly = { .name = "PPTP", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; static const struct proto_ops pptp_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pptp_release, .bind = pptp_bind, .connect = pptp_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pptp_getname, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppox_pptp_proto = { .create = pptp_create, .owner = THIS_MODULE, }; static const struct gre_protocol gre_pptp_protocol = { .handler = pptp_rcv, }; static int __init pptp_init_module(void) { int err = 0; pr_info("PPTP driver version " PPTP_DRIVER_VERSION "\n"); callid_sock = vzalloc(array_size(sizeof(void *), (MAX_CALLID + 1))); if (!callid_sock) return -ENOMEM; err = gre_add_protocol(&gre_pptp_protocol, GREPROTO_PPTP); if (err) { pr_err("PPTP: can't add gre protocol\n"); goto out_mem_free; } err = proto_register(&pptp_sk_proto, 0); if (err) { pr_err("PPTP: can't register sk_proto\n"); goto out_gre_del_protocol; } err = register_pppox_proto(PX_PROTO_PPTP, &pppox_pptp_proto); if (err) { pr_err("PPTP: can't register pppox_proto\n"); goto out_unregister_sk_proto; } return 0; out_unregister_sk_proto: proto_unregister(&pptp_sk_proto); out_gre_del_protocol: gre_del_protocol(&gre_pptp_protocol, GREPROTO_PPTP); out_mem_free: vfree(callid_sock); return err; } static void __exit pptp_exit_module(void) { unregister_pppox_proto(PX_PROTO_PPTP); proto_unregister(&pptp_sk_proto); gre_del_protocol(&gre_pptp_protocol, GREPROTO_PPTP); vfree(callid_sock); } module_init(pptp_init_module); module_exit(pptp_exit_module); MODULE_DESCRIPTION("Point-to-Point Tunneling Protocol"); MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_PPTP); |
26 33 33 62 26 33 33 67 68 1 26 26 1 67 50 2 16 68 33 67 4 5 5 5 5 5 5 1 5 38 38 38 38 33 19 19 19 19 17 7 37 65 66 66 63 21 14 18 21 66 38 38 38 38 38 11 36 38 38 3 1 3 1 2 2 4 4 4 4 4 4 5 1 1 1 1 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 | // SPDX-License-Identifier: GPL-2.0 #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/tcp.h> #include <linux/hash.h> #include <linux/tcp_metrics.h> #include <linux/vmalloc.h> #include <net/inet_connection_sock.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/dst.h> #include <net/tcp.h> #include <net/genetlink.h> static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash); struct tcp_fastopen_metrics { u16 mss; u16 syn_loss:10, /* Recurring Fast Open SYN losses */ try_exp:2; /* Request w/ exp. option (once) */ unsigned long last_syn_loss; /* Last Fast Open SYN loss */ struct tcp_fastopen_cookie cookie; }; /* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility * Kernel only stores RTT and RTTVAR in usec resolution */ #define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; struct net *tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; u32 tcpm_lock; u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; struct tcp_fastopen_metrics tcpm_fastopen; struct rcu_head rcu_head; }; static inline struct net *tm_net(const struct tcp_metrics_block *tm) { /* Paired with the WRITE_ONCE() in tcpm_new() */ return READ_ONCE(tm->tcpm_net); } static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { /* Paired with WRITE_ONCE() in tcpm_suck_dst() */ return READ_ONCE(tm->tcpm_lock) & (1 << idx); } static u32 tcp_metric_get(const struct tcp_metrics_block *tm, enum tcp_metric_index idx) { /* Paired with WRITE_ONCE() in tcp_metric_set() */ return READ_ONCE(tm->tcpm_vals[idx]); } static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) { /* Paired with READ_ONCE() in tcp_metric_get() */ WRITE_ONCE(tm->tcpm_vals[idx], val); } static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { return (a->family == b->family) && !inetpeer_addr_cmp(a, b); } struct tcpm_hash_bucket { struct tcp_metrics_block __rcu *chain; }; static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; static unsigned int tcp_metrics_hash_log __read_mostly; static DEFINE_SPINLOCK(tcp_metrics_lock); static DEFINE_SEQLOCK(fastopen_seqlock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, const struct dst_entry *dst, bool fastopen_clear) { u32 msval; u32 val; WRITE_ONCE(tm->tcpm_stamp, jiffies); val = 0; if (dst_metric_locked(dst, RTAX_RTT)) val |= 1 << TCP_METRIC_RTT; if (dst_metric_locked(dst, RTAX_RTTVAR)) val |= 1 << TCP_METRIC_RTTVAR; if (dst_metric_locked(dst, RTAX_SSTHRESH)) val |= 1 << TCP_METRIC_SSTHRESH; if (dst_metric_locked(dst, RTAX_CWND)) val |= 1 << TCP_METRIC_CWND; if (dst_metric_locked(dst, RTAX_REORDERING)) val |= 1 << TCP_METRIC_REORDERING; /* Paired with READ_ONCE() in tcp_metric_locked() */ WRITE_ONCE(tm->tcpm_lock, val); msval = dst_metric_raw(dst, RTAX_RTT); tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC); msval = dst_metric_raw(dst, RTAX_RTTVAR); tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC); tcp_metric_set(tm, TCP_METRIC_SSTHRESH, dst_metric_raw(dst, RTAX_SSTHRESH)); tcp_metric_set(tm, TCP_METRIC_CWND, dst_metric_raw(dst, RTAX_CWND)); tcp_metric_set(tm, TCP_METRIC_REORDERING, dst_metric_raw(dst, RTAX_REORDERING)); if (fastopen_clear) { write_seqlock(&fastopen_seqlock); tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; tm->tcpm_fastopen.try_exp = 0; tm->tcpm_fastopen.cookie.exp = false; tm->tcpm_fastopen.cookie.len = 0; write_sequnlock(&fastopen_seqlock); } } #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) static void tcpm_check_stamp(struct tcp_metrics_block *tm, const struct dst_entry *dst) { unsigned long limit; if (!tm) return; limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT; if (unlikely(time_after(jiffies, limit))) tcpm_suck_dst(tm, dst, false); } #define TCP_METRICS_RECLAIM_DEPTH 5 #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL #define deref_locked(p) \ rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock)) static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct inetpeer_addr *saddr, struct inetpeer_addr *daddr, unsigned int hash) { struct tcp_metrics_block *tm; struct net *net; bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); net = dev_net(dst->dev); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. */ tm = __tcp_get_metrics(saddr, daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) { reclaim = true; tm = NULL; } if (tm) { tcpm_check_stamp(tm, dst); goto out_unlock; } if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; oldest = deref_locked(tcp_metrics_hash[hash].chain); for (tm = deref_locked(oldest->tcpm_next); tm; tm = deref_locked(tm->tcpm_next)) { if (time_before(READ_ONCE(tm->tcpm_stamp), READ_ONCE(oldest->tcpm_stamp))) oldest = tm; } tm = oldest; } else { tm = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) goto out_unlock; } /* Paired with the READ_ONCE() in tm_net() */ WRITE_ONCE(tm->tcpm_net, net); tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, reclaim); if (likely(!reclaim)) { tm->tcpm_next = tcp_metrics_hash[hash].chain; rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm); } out_unlock: spin_unlock_bh(&tcp_metrics_lock); return tm; } static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) { if (tm) return tm; if (depth > TCP_METRICS_RECLAIM_DEPTH) return TCP_METRICS_RECLAIM_PTR; return NULL; } static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash) { struct tcp_metrics_block *tm; int depth = 0; for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, saddr) && addr_same(&tm->tcpm_daddr, daddr) && net_eq(tm_net(tm), net)) break; depth++; } return tcp_get_encode(tm, depth); } static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; saddr.family = req->rsk_ops->family; daddr.family = req->rsk_ops->family; switch (daddr.family) { case AF_INET: inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr); inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr); hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr); inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr); hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif default: return NULL; } net = dev_net(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && addr_same(&tm->tcpm_daddr, &daddr) && net_eq(tm_net(tm), net)) break; } tcpm_check_stamp(tm, dst); return tm; } static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct dst_entry *dst, bool create) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; if (sk->sk_family == AF_INET) { inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } else { inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr); inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr); hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } #endif else return NULL; net = dev_net(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); tm = __tcp_get_metrics(&saddr, &daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) tm = NULL; if (!tm && create) tm = tcpm_new(dst, &saddr, &daddr, hash); else tcpm_check_stamp(tm, dst); return tm; } /* Save metrics learned by this TCP session. This function is called * only, when TCP finishes successfully i.e. when it enters TIME-WAIT * or goes from LAST-ACK to CLOSE. */ void tcp_update_metrics(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct tcp_metrics_block *tm; unsigned long rtt; u32 val; int m; sk_dst_confirm(sk); if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst) return; rcu_read_lock(); if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. Reset our * results. */ tm = tcp_get_metrics(sk, dst, false); if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) tcp_metric_set(tm, TCP_METRIC_RTT, 0); goto out_unlock; } else tm = tcp_get_metrics(sk, dst, true); if (!tm) goto out_unlock; rtt = tcp_metric_get(tm, TCP_METRIC_RTT); m = rtt - tp->srtt_us; /* If newly calculated rtt larger than stored one, store new * one. Otherwise, use EWMA. Remember, rtt overestimation is * always better than underestimation. */ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { if (m <= 0) rtt = tp->srtt_us; else rtt -= (m >> 3); tcp_metric_set(tm, TCP_METRIC_RTT, rtt); } if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { unsigned long var; if (m < 0) m = -m; /* Scale deviation to rttvar fixed point */ m >>= 1; if (m < tp->mdev_us) m = tp->mdev_us; var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); if (m >= var) var = m; else var -= (var - m) >> 2; tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); } if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tcp_snd_cwnd(tp) >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, tcp_snd_cwnd(tp) >> 1); } if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); if (tcp_snd_cwnd(tp) > val) tcp_metric_set(tm, TCP_METRIC_CWND, tcp_snd_cwnd(tp)); } } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1); } } else { /* Else slow start did not finish, cwnd is non-sense, * ssthresh may be also invalid. */ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_ssthresh) >> 1); } if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && tp->snd_ssthresh > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, tp->snd_ssthresh); } if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && tp->reordering != READ_ONCE(net->ipv4.sysctl_tcp_reordering)) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } } WRITE_ONCE(tm->tcpm_stamp, jiffies); out_unlock: rcu_read_unlock(); } /* Initialize metrics on socket. */ void tcp_init_metrics(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ sk_dst_confirm(sk); /* ssthresh may have been reduced unnecessarily during. * 3WHS. Restore it back to its initial default. */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; if (!dst) goto reset; rcu_read_lock(); tm = tcp_get_metrics(sk, dst, false); if (!tm) { rcu_read_unlock(); goto reset; } if (tcp_metric_locked(tm, TCP_METRIC_CWND)) tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ? 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { tp->snd_ssthresh = val; if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) tp->reordering = val; crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); reset: /* The initial RTT measurement from the SYN/SYN-ACK is not ideal * to seed the RTO for later data packets because SYN packets are * small. Use the per-dst cached values to seed the RTO but keep * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). * Later the RTO will be updated immediately upon obtaining the first * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only * influences the first RTO but not later RTT estimation. * * But if RTT is not available from the SYN (due to retransmits or * syn cookies) or the cache, force a conservative 3secs timeout. * * A bit of theory. RTT is time passed after "normal" sized packet * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever * tricks sort of "quick acks" for time long enough to decrease RTT * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ crtt /= 8 * USEC_PER_SEC / HZ; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs * from the more aggressive 1sec to avoid more spurious * retransmission. */ tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; bool ret; if (!dst) return false; rcu_read_lock(); tm = __tcp_get_metrics_req(req, dst); if (tm && tcp_metric_get(tm, TCP_METRIC_RTT)) ret = true; else ret = false; rcu_read_unlock(); return ret; } void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { struct tcp_metrics_block *tm; rcu_read_lock(); tm = tcp_get_metrics(sk, __sk_dst_get(sk), false); if (tm) { struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; unsigned int seq; do { seq = read_seqbegin(&fastopen_seqlock); if (tfom->mss) *mss = tfom->mss; *cookie = tfom->cookie; if (cookie->len <= 0 && tfom->try_exp == 1) cookie->exp = true; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); } void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_metrics_block *tm; if (!dst) return; rcu_read_lock(); tm = tcp_get_metrics(sk, dst, true); if (tm) { struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; write_seqlock_bh(&fastopen_seqlock); if (mss) tfom->mss = mss; if (cookie && cookie->len > 0) tfom->cookie = *cookie; else if (try_exp > tfom->try_exp && tfom->cookie.len <= 0 && !tfom->cookie.exp) tfom->try_exp = try_exp; if (syn_lost) { ++tfom->syn_loss; tfom->last_syn_loss = jiffies; } else tfom->syn_loss = 0; write_sequnlock_bh(&fastopen_seqlock); } rcu_read_unlock(); } static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, }, /* Following attributes are not received for GET/DEL, * we keep them for reference */ #if 0 [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, }, [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, }, [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, }, [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, }, [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, }, [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, }, [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY, .len = TCP_FASTOPEN_COOKIE_MAX, }, #endif }; /* Add attributes, caller cancels its header on failure */ static int tcp_metrics_fill_info(struct sk_buff *msg, struct tcp_metrics_block *tm) { struct nlattr *nest; int i; switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; default: return -EAFNOSUPPORT; } if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, jiffies - READ_ONCE(tm->tcpm_stamp), TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; { int n = 0; nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS); if (!nest) goto nla_put_failure; for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { u32 val = tcp_metric_get(tm, i); if (!val) continue; if (i == TCP_METRIC_RTT) { if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, val) < 0) goto nla_put_failure; n++; val = max(val / 1000, 1U); } if (i == TCP_METRIC_RTTVAR) { if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, val) < 0) goto nla_put_failure; n++; val = max(val / 1000, 1U); } if (nla_put_u32(msg, i + 1, val) < 0) goto nla_put_failure; n++; } if (n) nla_nest_end(msg, nest); else nla_nest_cancel(msg, nest); } { struct tcp_fastopen_metrics tfom_copy[1], *tfom; unsigned int seq; do { seq = read_seqbegin(&fastopen_seqlock); tfom_copy[0] = tm->tcpm_fastopen; } while (read_seqretry(&fastopen_seqlock, seq)); tfom = tfom_copy; if (tfom->mss && nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS, tfom->mss) < 0) goto nla_put_failure; if (tfom->syn_loss && (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS, tfom->syn_loss) < 0 || nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, jiffies - tfom->last_syn_loss, TCP_METRICS_ATTR_PAD) < 0)) goto nla_put_failure; if (tfom->cookie.len > 0 && nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE, tfom->cookie.len, tfom->cookie.val) < 0) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int tcp_metrics_dump_info(struct sk_buff *skb, struct netlink_callback *cb, struct tcp_metrics_block *tm) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &tcp_metrics_nl_family, NLM_F_MULTI, TCP_METRICS_CMD_GET); if (!hdr) return -EMSGSIZE; if (tcp_metrics_fill_info(skb, tm) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int tcp_metrics_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; for (row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; struct tcpm_hash_bucket *hb = tcp_metrics_hash + row; rcu_read_lock(); for (col = 0, tm = rcu_dereference(hb->chain); tm; tm = rcu_dereference(tm->tcpm_next), col++) { if (!net_eq(tm_net(tm), net)) continue; if (col < s_col) continue; if (tcp_metrics_dump_info(skb, cb, tm) < 0) { rcu_read_unlock(); goto done; } } rcu_read_unlock(); } done: cb->args[0] = row; cb->args[1] = col; return skb->len; } static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, unsigned int *hash, int optional, int v4, int v6) { struct nlattr *a; a = info->attrs[v4]; if (a) { inetpeer_set_addr_v4(addr, nla_get_in_addr(a)); if (hash) *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr)); return 0; } a = info->attrs[v6]; if (a) { struct in6_addr in6; if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; in6 = nla_get_in6_addr(a); inetpeer_set_addr_v6(addr, &in6); if (hash) *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr)); return 0; } return optional ? 1 : -EAFNOSUPPORT; } static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, unsigned int *hash, int optional) { return __parse_nl_addr(info, addr, hash, optional, TCP_METRICS_ATTR_ADDR_IPV4, TCP_METRICS_ATTR_ADDR_IPV6); } static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) { return __parse_nl_addr(info, addr, NULL, 0, TCP_METRICS_ATTR_SADDR_IPV4, TCP_METRICS_ATTR_SADDR_IPV6); } static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct sk_buff *msg; struct net *net = genl_info_net(info); void *reply; int ret; bool src = true; ret = parse_nl_addr(info, &daddr, &hash, 0); if (ret < 0) return ret; ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0, info->genlhdr->cmd); if (!reply) goto nla_put_failure; hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); ret = -ESRCH; rcu_read_lock(); for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && net_eq(tm_net(tm), net)) { ret = tcp_metrics_fill_info(msg, tm); break; } } rcu_read_unlock(); if (ret < 0) goto out_free; genlmsg_end(msg, reply); return genlmsg_reply(msg, info); nla_put_failure: ret = -EMSGSIZE; out_free: nlmsg_free(msg); return ret; } static void tcp_metrics_flush_all(struct net *net) { unsigned int max_rows = 1U << tcp_metrics_hash_log; struct tcpm_hash_bucket *hb = tcp_metrics_hash; struct tcp_metrics_block *tm; unsigned int row; for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp; bool match; spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : !refcount_read(&tm_net(tm)->ns.count); if (match) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); } else { pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); } } static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct tcpm_hash_bucket *hb; struct tcp_metrics_block *tm; struct tcp_metrics_block __rcu **pp; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net = genl_info_net(info); int ret; bool src = true, found = false; ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; if (ret > 0) { tcp_metrics_flush_all(net); return 0; } ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); hb = tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && net_eq(tm_net(tm), net)) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); found = true; } else { pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); if (!found) return -ESRCH; return 0; } static const struct genl_small_ops tcp_metrics_nl_ops[] = { { .cmd = TCP_METRICS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_get, .dumpit = tcp_metrics_nl_dump, }, { .cmd = TCP_METRICS_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_del, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family tcp_metrics_nl_family __ro_after_init = { .hdrsize = 0, .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, .maxattr = TCP_METRICS_ATTR_MAX, .policy = tcp_metrics_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = tcp_metrics_nl_ops, .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops), }; static unsigned int tcpmhash_entries; static int __init set_tcpmhash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtouint(str, 0, &tcpmhash_entries); if (ret) return 0; return 1; } __setup("tcpmhash_entries=", set_tcpmhash_entries); static int __net_init tcp_net_metrics_init(struct net *net) { size_t size; unsigned int slots; if (!net_eq(net, &init_net)) return 0; slots = tcpmhash_entries; if (!slots) { if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; else slots = 8 * 1024; } tcp_metrics_hash_log = order_base_2(slots); size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) return -ENOMEM; return 0; } static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { .init = tcp_net_metrics_init, .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) { int ret; ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) panic("Could not allocate the tcp_metrics hash table\n"); ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) panic("Could not register tcp_metrics generic netlink\n"); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock LSM - Ruleset management * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #ifndef _SECURITY_LANDLOCK_RULESET_H #define _SECURITY_LANDLOCK_RULESET_H #include <linux/bitops.h> #include <linux/build_bug.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include "limits.h" #include "object.h" typedef u16 access_mask_t; /* Makes sure all filesystem access rights can be stored. */ static_assert(BITS_PER_TYPE(access_mask_t) >= LANDLOCK_NUM_ACCESS_FS); /* Makes sure for_each_set_bit() and for_each_clear_bit() calls are OK. */ static_assert(sizeof(unsigned long) >= sizeof(access_mask_t)); typedef u16 layer_mask_t; /* Makes sure all layers can be checked. */ static_assert(BITS_PER_TYPE(layer_mask_t) >= LANDLOCK_MAX_NUM_LAYERS); /** * struct landlock_layer - Access rights for a given layer */ struct landlock_layer { /** * @level: Position of this layer in the layer stack. */ u16 level; /** * @access: Bitfield of allowed actions on the kernel object. They are * relative to the object type (e.g. %LANDLOCK_ACTION_FS_READ). */ access_mask_t access; }; /** * struct landlock_rule - Access rights tied to an object */ struct landlock_rule { /** * @node: Node in the ruleset's red-black tree. */ struct rb_node node; /** * @object: Pointer to identify a kernel object (e.g. an inode). This * is used as a key for this ruleset element. This pointer is set once * and never modified. It always points to an allocated object because * each rule increments the refcount of its object. */ struct landlock_object *object; /** * @num_layers: Number of entries in @layers. */ u32 num_layers; /** * @layers: Stack of layers, from the latest to the newest, implemented * as a flexible array member (FAM). */ struct landlock_layer layers[]; }; /** * struct landlock_hierarchy - Node in a ruleset hierarchy */ struct landlock_hierarchy { /** * @parent: Pointer to the parent node, or NULL if it is a root * Landlock domain. */ struct landlock_hierarchy *parent; /** * @usage: Number of potential children domains plus their parent * domain. */ refcount_t usage; }; /** * struct landlock_ruleset - Landlock ruleset * * This data structure must contain unique entries, be updatable, and quick to * match an object. */ struct landlock_ruleset { /** * @root: Root of a red-black tree containing &struct landlock_rule * nodes. Once a ruleset is tied to a process (i.e. as a domain), this * tree is immutable until @usage reaches zero. */ struct rb_root root; /** * @hierarchy: Enables hierarchy identification even when a parent * domain vanishes. This is needed for the ptrace protection. */ struct landlock_hierarchy *hierarchy; union { /** * @work_free: Enables to free a ruleset within a lockless * section. This is only used by * landlock_put_ruleset_deferred() when @usage reaches zero. * The fields @lock, @usage, @num_rules, @num_layers and * @fs_access_masks are then unused. */ struct work_struct work_free; struct { /** * @lock: Protects against concurrent modifications of * @root, if @usage is greater than zero. */ struct mutex lock; /** * @usage: Number of processes (i.e. domains) or file * descriptors referencing this ruleset. */ refcount_t usage; /** * @num_rules: Number of non-overlapping (i.e. not for * the same object) rules in this ruleset. */ u32 num_rules; /** * @num_layers: Number of layers that are used in this * ruleset. This enables to check that all the layers * allow an access request. A value of 0 identifies a * non-merged ruleset (i.e. not a domain). */ u32 num_layers; /** * @fs_access_masks: Contains the subset of filesystem * actions that are restricted by a ruleset. A domain * saves all layers of merged rulesets in a stack * (FAM), starting from the first layer to the last * one. These layers are used when merging rulesets, * for user space backward compatibility (i.e. * future-proof), and to properly handle merged * rulesets without overlapping access rights. These * layers are set once and never changed for the * lifetime of the ruleset. */ access_mask_t fs_access_masks[]; }; }; }; struct landlock_ruleset * landlock_create_ruleset(const access_mask_t fs_access_mask); void landlock_put_ruleset(struct landlock_ruleset *const ruleset); void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset); int landlock_insert_rule(struct landlock_ruleset *const ruleset, struct landlock_object *const object, const access_mask_t access); struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset); const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_object *const object); static inline void landlock_get_ruleset(struct landlock_ruleset *const ruleset) { if (ruleset) refcount_inc(&ruleset->usage); } #endif /* _SECURITY_LANDLOCK_RULESET_H */ |
110 111 111 111 111 111 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 | // SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc(sizeof(*qos), GFP_KERNEL); if (!qos) return -ENOMEM; n = kzalloc(3 * sizeof(*n), GFP_KERNEL); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance); |
50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, Intel Corporation. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/dsfield.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_skbedit.h> #include <net/tc_act/tc_skbedit.h> static unsigned int skbedit_net_id; static struct tc_action_ops act_skbedit_ops; static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; int action; tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) skb->priority = params->priority; if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto err; skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2; break; case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto err; skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; break; } } if (params->flags & SKBEDIT_F_QUEUE_MAPPING && skb->dev->real_num_tx_queues > params->queue_mapping) skb_set_queue_mapping(skb, params->queue_mapping); if (params->flags & SKBEDIT_F_MARK) { skb->mark &= ~params->mask; skb->mark |= params->mark & params->mask; } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; return action; err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); return TC_ACT_SHOT; } static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_skbedit *parm; struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; u16 *queue_mapping = NULL, *ptype = NULL; bool exists = false; int ret = 0, err; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy, NULL); if (err < 0) return err; if (tb[TCA_SKBEDIT_PARMS] == NULL) return -EINVAL; if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { flags |= SKBEDIT_F_PRIORITY; priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]); } if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { flags |= SKBEDIT_F_QUEUE_MAPPING; queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } if (tb[TCA_SKBEDIT_PTYPE] != NULL) { ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]); if (!skb_pkt_type_ok(*ptype)) return -EINVAL; flags |= SKBEDIT_F_PTYPE; } if (tb[TCA_SKBEDIT_MARK] != NULL) { flags |= SKBEDIT_F_MARK; mark = nla_data(tb[TCA_SKBEDIT_MARK]); } if (tb[TCA_SKBEDIT_MASK] != NULL) { flags |= SKBEDIT_F_MASK; mask = nla_data(tb[TCA_SKBEDIT_MASK]); } if (tb[TCA_SKBEDIT_FLAGS] != NULL) { u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) flags |= SKBEDIT_F_INHERITDSFIELD; } parm = nla_data(tb[TCA_SKBEDIT_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return 0; if (!flags) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_skbedit_ops, bind, true, act_flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } d = to_skbedit(*a); ret = ACT_P_CREATED; } else { d = to_skbedit(*a); if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { err = -ENOMEM; goto put_chain; } params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) params_new->priority = *priority; if (flags & SKBEDIT_F_QUEUE_MAPPING) params_new->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) params_new->ptype = *ptype; /* default behaviour is to use all the bits */ params_new->mask = 0xffffffff; if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(d->params, params_new, lockdep_is_held(&d->tcf_lock)); spin_unlock_bh(&d->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, }; u64 pure_flags = 0; struct tcf_t t; spin_lock_bh(&d->tcf_lock); params = rcu_dereference_protected(d->params, lockdep_is_held(&d->tcf_lock)); opt.action = d->tcf_action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_PRIORITY) && nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) && nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_MARK) && nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_PTYPE) && nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_MASK) && nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask)) goto nla_put_failure; if (params->flags & SKBEDIT_F_INHERITDSFIELD) pure_flags |= SKBEDIT_F_INHERITDSFIELD; if (pure_flags != 0 && nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_skbedit_cleanup(struct tc_action *a) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; params = rcu_dereference_protected(d->params, 1); if (params) kfree_rcu(params, rcu); } static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); return tcf_idr_search(tn, a, index); } static size_t tcf_skbedit_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_skbedit)) + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_PRIORITY */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING */ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MARK */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_PTYPE */ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MASK */ + nla_total_size_64bit(sizeof(u64)); /* TCA_SKBEDIT_FLAGS */ } static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .id = TCA_ID_SKBEDIT, .owner = THIS_MODULE, .act = tcf_skbedit_act, .stats_update = tcf_skbedit_stats_update, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, .walk = tcf_skbedit_walker, .get_fill_size = tcf_skbedit_get_fill_size, .lookup = tcf_skbedit_search, .size = sizeof(struct tcf_skbedit), }; static __net_init int skbedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); return tc_action_net_init(net, tn, &act_skbedit_ops); } static void __net_exit skbedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, skbedit_net_id); } static struct pernet_operations skbedit_net_ops = { .init = skbedit_init_net, .exit_batch = skbedit_exit_net, .id = &skbedit_net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); MODULE_DESCRIPTION("SKB Editing"); MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops); } static void __exit skbedit_cleanup_module(void) { tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops); } module_init(skbedit_init_module); module_exit(skbedit_cleanup_module); |
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 | // SPDX-License-Identifier: GPL-2.0 /* * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, * for the blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/elevator.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/sbitmap.h> #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" /* * See Documentation/block/deadline-iosched.rst */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ enum dd_data_dir { DD_READ = READ, DD_WRITE = WRITE, }; enum { DD_DIR_COUNT = 2 }; enum dd_prio { DD_RT_PRIO = 0, DD_BE_PRIO = 1, DD_IDLE_PRIO = 2, DD_PRIO_MAX = 2, }; enum { DD_PRIO_COUNT = 3 }; /* I/O statistics per I/O priority. */ struct io_stats_per_prio { local_t inserted; local_t merged; local_t dispatched; local_t completed; }; /* I/O statistics for all I/O priorities (enum dd_prio). */ struct io_stats { struct io_stats_per_prio stats[DD_PRIO_COUNT]; }; /* * Deadline scheduler data per I/O priority (enum dd_prio). Requests are * present on both sort_list[] and fifo_list[]. */ struct dd_per_prio { struct list_head dispatch; struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; /* Next request in FIFO order. Read, write or both are NULL. */ struct request *next_rq[DD_DIR_COUNT]; }; struct deadline_data { /* * run time data */ struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ enum dd_data_dir last_dir; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ struct io_stats __percpu *stats; /* * settings that change how the i/o scheduler behaves */ int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; u32 async_depth; spinlock_t lock; spinlock_t zone_lock; }; /* Count one event of type 'event_type' and with I/O priority 'prio' */ #define dd_count(dd, event_type, prio) do { \ struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ \ BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ local_inc(&io_stats->stats[(prio)].event_type); \ put_cpu_ptr(io_stats); \ } while (0) /* * Returns the total number of dd_count(dd, event_type, prio) calls across all * CPUs. No locking or barriers since it is fine if the returned sum is slightly * outdated. */ #define dd_sum(dd, event_type, prio) ({ \ unsigned int cpu; \ u32 sum = 0; \ \ BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ for_each_present_cpu(cpu) \ sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ stats[(prio)].event_type); \ sum; \ }) /* Maps an I/O priority class to a deadline scheduler priority. */ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, [IOPRIO_CLASS_RT] = DD_RT_PRIO, [IOPRIO_CLASS_BE] = DD_BE_PRIO, [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { return &per_prio->sort_list[rq_data_dir(rq)]; } /* * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a * request. */ static u8 dd_rq_ioclass(struct request *rq) { return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } /* * get the request before `rq' in sector-sorted order */ static inline struct request * deadline_earlier_request(struct request *rq) { struct rb_node *node = rb_prev(&rq->rb_node); if (node) return rb_entry_rq(node); return NULL; } /* * get the request after `rq' in sector-sorted order */ static inline struct request * deadline_latter_request(struct request *rq) { struct rb_node *node = rb_next(&rq->rb_node); if (node) return rb_entry_rq(node); return NULL; } static void deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { const enum dd_data_dir data_dir = rq_data_dir(rq); if (per_prio->next_rq[data_dir] == rq) per_prio->next_rq[data_dir] = deadline_latter_request(rq); elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. */ static void deadline_remove_request(struct request_queue *q, struct dd_per_prio *per_prio, struct request *rq) { list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) q->last_merge = NULL; } static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(req); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { elv_rb_del(deadline_rb_root(per_prio, req), req); deadline_add_rq_rb(per_prio, req); } } /* * Callback function that is invoked after @next has been merged into @req. */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, merged, prio); /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo */ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { if (time_before((unsigned long)next->fifo_time, (unsigned long)req->fifo_time)) { list_move(&req->queuelist, &next->queuelist); req->fifo_time = next->fifo_time; } } /* * kill knowledge of next, this one is a goner */ deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq) { const enum dd_data_dir data_dir = rq_data_dir(rq); per_prio->next_rq[data_dir] = deadline_latter_request(rq); /* * take it off the sort and fifo list */ deadline_remove_request(rq->q, per_prio, rq); } /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ static inline int deadline_check_fifo(struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); /* * rq is expired! */ if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) return 1; return 0; } /* * Check if rq has a sequential request preceding it. */ static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) { struct request *prev = deadline_earlier_request(rq); if (!prev) return false; return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); } /* * Skip all write requests that are sequential from @rq, even if we cross * a zone boundary. */ static struct request *deadline_skip_seq_writes(struct deadline_data *dd, struct request *rq) { sector_t pos = blk_rq_pos(rq); sector_t skipped_sectors = 0; while (rq) { if (blk_rq_pos(rq) != pos + skipped_sectors) break; skipped_sectors += blk_rq_sectors(rq); rq = deadline_latter_request(rq); } return rq; } /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. */ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* * Look for a write request that can be dispatched, that is one with * an unlocked target zone. For some HDDs, breaking a sequential * write stream can lead to lower throughput, so make sure to preserve * sequential write streams, even if that stream crosses into the next * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { if (blk_req_can_dispatch_to_zone(rq) && (blk_queue_nonrot(rq->q) || !deadline_is_seq_write(dd, rq))) goto out; } rq = NULL; out: spin_unlock_irqrestore(&dd->zone_lock, flags); return rq; } /* * For the specified data direction, return the next request to * dispatch using sector position sorted lists. */ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; rq = per_prio->next_rq[data_dir]; if (!rq) return NULL; if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* * Look for a write request that can be dispatched, that is one with * an unlocked target zone. For some HDDs, breaking a sequential * write stream can lead to lower throughput, so make sure to preserve * sequential write streams, even if that stream crosses into the next * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); while (rq) { if (blk_req_can_dispatch_to_zone(rq)) break; if (blk_queue_nonrot(rq->q)) rq = deadline_latter_request(rq); else rq = deadline_skip_seq_writes(dd, rq); } spin_unlock_irqrestore(&dd->zone_lock, flags); return rq; } /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ static struct request *__dd_dispatch_request(struct deadline_data *dd, struct dd_per_prio *per_prio) { struct request *rq, *next_rq; enum dd_data_dir data_dir; enum dd_prio prio; u8 ioprio_class; lockdep_assert_held(&dd->lock); if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); list_del_init(&rq->queuelist); goto done; } /* * batches are currently reads XOR writes */ rq = deadline_next_request(dd, per_prio, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ goto dispatch_request; /* * at this point we are not running a batch. select the appropriate * data direction (read / write) */ if (!list_empty(&per_prio->fifo_list[DD_READ])) { BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); if (deadline_fifo_request(dd, per_prio, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = DD_READ; goto dispatch_find_request; } /* * there are either no reads or writes have been starved */ if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { dispatch_writes: BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); dd->starved = 0; data_dir = DD_WRITE; goto dispatch_find_request; } return NULL; dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ next_rq = deadline_next_request(dd, per_prio, data_dir); if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ rq = next_rq; } /* * For a zoned block device, if we only have writes queued and none of * them can be dispatched, rq will be NULL. */ if (!rq) return NULL; dd->last_dir = data_dir; dd->batching = 0; dispatch_request: /* * rq is the selected appropriate request. */ dd->batching++; deadline_move_request(dd, per_prio, rq); done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, dispatched, prio); /* * If the request needs its target zone locked, do it. */ blk_req_zone_write_lock(rq); rq->rq_flags |= RQF_STARTED; return rq; } /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; enum dd_prio prio; spin_lock(&dd->lock); for (prio = 0; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); if (rq) break; } spin_unlock(&dd->lock); return rq; } /* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { struct deadline_data *dd = data->q->elevator->elevator_data; /* Do not throttle synchronous reads. */ if (op_is_sync(op) && !op_is_write(op)) return; /* * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ data->shallow_depth = dd->async_depth; } /* Called by blk_mq_update_nr_requests(). */ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; dd->async_depth = max(1UL, 3 * q->nr_requests / 4); sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { dd_depth_updated(hctx); return 0; } static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); } free_percpu(dd->stats); kfree(dd); } /* * initialize elevator private data (deadline_data). */ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; enum dd_prio prio; int ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) goto put_eq; eq->elevator_data = dd; dd->stats = alloc_percpu_gfp(typeof(*dd->stats), GFP_KERNEL | __GFP_ZERO); if (!dd->stats) goto free_dd; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; INIT_LIST_HEAD(&per_prio->dispatch); INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); per_prio->sort_list[DD_READ] = RB_ROOT; per_prio->sort_list[DD_WRITE] = RB_ROOT; } dd->fifo_expire[DD_READ] = read_expire; dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); q->elevator = eq; return 0; free_dd: kfree(dd); put_eq: kobject_put(&eq->kobj); return ret; } /* * Try to merge @bio into an existing request. If @bio has been merged into * an existing request, store the pointer to that request into *@rq. */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; if (blk_discard_mergable(__rq)) return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } return ELEVATOR_NO_MERGE; } /* * Attempt to merge a bio into an existing request. This function is called * before @bio is associated with a request. */ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; spin_lock(&dd->lock); ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); if (free) blk_mq_free_request(free); return ret; } /* * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); u16 ioprio = req_get_ioprio(rq); u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; LIST_HEAD(free); lockdep_assert_held(&dd->lock); /* * This may be a requeue of a write request that has locked its * target zone. If it is the case, this releases the zone lock. */ blk_req_zone_write_unlock(rq); prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, inserted, prio); rq->elv.priv[0] = (void *)(uintptr_t)1; if (blk_mq_sched_try_insert_merge(q, rq, &free)) { blk_mq_free_requests(&free); return; } trace_block_rq_insert(rq); per_prio = &dd->per_prio[prio]; if (at_head) { list_add(&rq->queuelist, &per_prio->dispatch); } else { deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); if (!q->last_merge) q->last_merge = rq; } /* * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); } } /* * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool at_head) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, at_head); } spin_unlock(&dd->lock); } /* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { rq->elv.priv[0] = NULL; } static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio p; for (p = 0; p <= DD_PRIO_MAX; p++) if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) return true; return false; } /* * Callback from inside blk_mq_free_request(). * * For zoned block devices, write unlock the target zone of * completed write requests. Do this while holding the zone lock * spinlock so that the zone is never unlocked while deadline_fifo_request() * or deadline_next_request() are executing. This function is called for * all requests, whether or not these requests complete successfully. * * For a zoned block device, __dd_dispatch_request() may have stopped * dispatching requests if all the queued requests are write requests directed * at zones that are already locked due to on-going write requests. To ensure * write request dispatch progress in this case, mark the queue as needing a * restart to ensure that the queue is run again after completion of the * request and zones being unlocked. */ static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(rq); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; /* * The block layer core may call dd_finish_request() without having * called dd_insert_requests(). Hence only update statistics for * requests for which dd_insert_requests() has been called. See also * blk_mq_request_bypass_insert(). */ if (rq->elv.priv[0]) dd_count(dd, completed, prio); if (blk_queue_is_zoned(q)) { unsigned long flags; spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); spin_unlock_irqrestore(&dd->zone_lock, flags); if (dd_has_write_work(rq->mq_hctx)) blk_mq_sched_mark_restart_hctx(rq->mq_hctx); } } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) { return !list_empty_careful(&per_prio->dispatch) || !list_empty_careful(&per_prio->fifo_list[DD_READ]) || !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); } static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; return false; } /* * sysfs parts below */ #define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ \ return sysfs_emit(page, "%d\n", __VAR); \ } #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data, __ret; \ \ __ret = kstrtoint(page, 0, &__data); \ if (__ret < 0) \ return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = __CONV(__data); \ return count; \ } #define STORE_INT(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) #define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT #undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), __ATTR_NULL }; #ifdef CONFIG_BLK_DEBUG_FS #define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ .start = deadline_##name##_fifo_start, \ .next = deadline_##name##_fifo_next, \ .stop = deadline_##name##_fifo_stop, \ .show = blk_mq_debugfs_rq_show, \ }; \ \ static int deadline_##name##_next_rq_show(void *data, \ struct seq_file *m) \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ struct request *rq = per_prio->next_rq[data_dir]; \ \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->batching); return 0; } static int deadline_starved_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->starved); return 0; } static int dd_async_depth_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->async_depth); return 0; } /* Number of requests queued for a given priority level. */ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) { return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); } static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), dd_queued(dd, DD_BE_PRIO), dd_queued(dd, DD_IDLE_PRIO)); return 0; } /* Number of requests owned by the block driver for a given priority. */ static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) { return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) - dd_sum(dd, completed, prio); } static int dd_owned_by_driver_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), dd_owned_by_driver(dd, DD_BE_PRIO), dd_owned_by_driver(dd, DD_IDLE_PRIO)); return 0; } #define DEADLINE_DISPATCH_ATTR(prio) \ static void *deadline_dispatch##prio##_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->dispatch, *pos); \ } \ \ static void *deadline_dispatch##prio##_next(struct seq_file *m, \ void *v, loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->dispatch, pos); \ } \ \ static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ .start = deadline_dispatch##prio##_start, \ .next = deadline_dispatch##prio##_next, \ .stop = deadline_dispatch##prio##_stop, \ .show = blk_mq_debugfs_rq_show, \ } DEADLINE_DISPATCH_ATTR(0); DEADLINE_DISPATCH_ATTR(1); DEADLINE_DISPATCH_ATTR(2); #undef DEADLINE_DISPATCH_ATTR #define DEADLINE_QUEUE_DDIR_ATTRS(name) \ {#name "_fifo_list", 0400, \ .seq_ops = &deadline_##name##_fifo_seq_ops} #define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_QUEUE_DDIR_ATTRS(read0), DEADLINE_QUEUE_DDIR_ATTRS(write0), DEADLINE_QUEUE_DDIR_ATTRS(read1), DEADLINE_QUEUE_DDIR_ATTRS(write1), DEADLINE_QUEUE_DDIR_ATTRS(read2), DEADLINE_QUEUE_DDIR_ATTRS(write2), DEADLINE_NEXT_RQ_ATTR(read0), DEADLINE_NEXT_RQ_ATTR(write0), DEADLINE_NEXT_RQ_ATTR(read1), DEADLINE_NEXT_RQ_ATTR(write1), DEADLINE_NEXT_RQ_ATTR(read2), DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"async_depth", 0400, dd_async_depth_show}, {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, {"owned_by_driver", 0400, dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS #endif static struct elevator_type mq_deadline = { .ops = { .depth_updated = dd_depth_updated, .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, .finish_request = dd_finish_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, .request_merge = dd_request_merge, .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { return elv_register(&mq_deadline); } static void __exit deadline_exit(void) { elv_unregister(&mq_deadline); } module_init(deadline_init); module_exit(deadline_exit); MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler"); |
38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_UNALIGNED_H #define __ASM_GENERIC_UNALIGNED_H /* * This is the most generic implementation of unaligned accesses * and should work almost anywhere. */ #include <linux/unaligned/packed_struct.h> #include <asm/byteorder.h> #define __get_unaligned_t(type, ptr) ({ \ const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ __pptr->x; \ }) #define __put_unaligned_t(type, val, ptr) do { \ struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ __pptr->x = (val); \ } while (0) #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) #define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr)) static inline u16 get_unaligned_le16(const void *p) { return le16_to_cpu(__get_unaligned_t(__le16, p)); } static inline u32 get_unaligned_le32(const void *p) { return le32_to_cpu(__get_unaligned_t(__le32, p)); } static inline u64 get_unaligned_le64(const void *p) { return le64_to_cpu(__get_unaligned_t(__le64, p)); } static inline void put_unaligned_le16(u16 val, void *p) { __put_unaligned_t(__le16, cpu_to_le16(val), p); } static inline void put_unaligned_le32(u32 val, void *p) { __put_unaligned_t(__le32, cpu_to_le32(val), p); } static inline void put_unaligned_le64(u64 val, void *p) { __put_unaligned_t(__le64, cpu_to_le64(val), p); } static inline u16 get_unaligned_be16(const void *p) { return be16_to_cpu(__get_unaligned_t(__be16, p)); } static inline u32 get_unaligned_be32(const void *p) { return be32_to_cpu(__get_unaligned_t(__be32, p)); } static inline u64 get_unaligned_be64(const void *p) { return be64_to_cpu(__get_unaligned_t(__be64, p)); } static inline void put_unaligned_be16(u16 val, void *p) { __put_unaligned_t(__be16, cpu_to_be16(val), p); } static inline void put_unaligned_be32(u32 val, void *p) { __put_unaligned_t(__be32, cpu_to_be32(val), p); } static inline void put_unaligned_be64(u64 val, void *p) { __put_unaligned_t(__be64, cpu_to_be64(val), p); } static inline u32 __get_unaligned_be24(const u8 *p) { return p[0] << 16 | p[1] << 8 | p[2]; } static inline u32 get_unaligned_be24(const void *p) { return __get_unaligned_be24(p); } static inline u32 __get_unaligned_le24(const u8 *p) { return p[0] | p[1] << 8 | p[2] << 16; } static inline u32 get_unaligned_le24(const void *p) { return __get_unaligned_le24(p); } static inline void __put_unaligned_be24(const u32 val, u8 *p) { *p++ = val >> 16; *p++ = val >> 8; *p++ = val; } static inline void put_unaligned_be24(const u32 val, void *p) { __put_unaligned_be24(val, p); } static inline void __put_unaligned_le24(const u32 val, u8 *p) { *p++ = val; *p++ = val >> 8; *p++ = val >> 16; } static inline void put_unaligned_le24(const u32 val, void *p) { __put_unaligned_le24(val, p); } #endif /* __ASM_GENERIC_UNALIGNED_H */ |
48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* AFS tracepoints * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #undef TRACE_SYSTEM #define TRACE_SYSTEM afs #if !defined(_TRACE_AFS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_AFS_H #include <linux/tracepoint.h> /* * Define enums for tracing information. */ #ifndef __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY enum afs_call_trace { afs_call_trace_alloc, afs_call_trace_free, afs_call_trace_get, afs_call_trace_put, afs_call_trace_wake, afs_call_trace_work, }; enum afs_server_trace { afs_server_trace_alloc, afs_server_trace_callback, afs_server_trace_destroy, afs_server_trace_free, afs_server_trace_gc, afs_server_trace_get_by_addr, afs_server_trace_get_by_uuid, afs_server_trace_get_caps, afs_server_trace_get_install, afs_server_trace_get_new_cbi, afs_server_trace_get_probe, afs_server_trace_give_up_cb, afs_server_trace_purging, afs_server_trace_put_call, afs_server_trace_put_cbi, afs_server_trace_put_find_rsq, afs_server_trace_put_probe, afs_server_trace_put_slist, afs_server_trace_put_slist_isort, afs_server_trace_put_uuid_rsq, afs_server_trace_update, }; enum afs_volume_trace { afs_volume_trace_alloc, afs_volume_trace_free, afs_volume_trace_get_alloc_sbi, afs_volume_trace_get_cell_insert, afs_volume_trace_get_new_op, afs_volume_trace_get_query_alias, afs_volume_trace_put_cell_dup, afs_volume_trace_put_cell_root, afs_volume_trace_put_destroy_sbi, afs_volume_trace_put_free_fc, afs_volume_trace_put_put_op, afs_volume_trace_put_query_alias, afs_volume_trace_put_validate_fc, afs_volume_trace_remove, }; enum afs_cell_trace { afs_cell_trace_alloc, afs_cell_trace_free, afs_cell_trace_get_queue_dns, afs_cell_trace_get_queue_manage, afs_cell_trace_get_queue_new, afs_cell_trace_get_vol, afs_cell_trace_insert, afs_cell_trace_manage, afs_cell_trace_put_candidate, afs_cell_trace_put_destroy, afs_cell_trace_put_queue_fail, afs_cell_trace_put_queue_work, afs_cell_trace_put_vol, afs_cell_trace_see_source, afs_cell_trace_see_ws, afs_cell_trace_unuse_alias, afs_cell_trace_unuse_check_alias, afs_cell_trace_unuse_delete, afs_cell_trace_unuse_fc, afs_cell_trace_unuse_lookup, afs_cell_trace_unuse_mntpt, afs_cell_trace_unuse_no_pin, afs_cell_trace_unuse_parse, afs_cell_trace_unuse_pin, afs_cell_trace_unuse_probe, afs_cell_trace_unuse_sbi, afs_cell_trace_unuse_ws, afs_cell_trace_use_alias, afs_cell_trace_use_check_alias, afs_cell_trace_use_fc, afs_cell_trace_use_fc_alias, afs_cell_trace_use_lookup, afs_cell_trace_use_mntpt, afs_cell_trace_use_pin, afs_cell_trace_use_probe, afs_cell_trace_use_sbi, afs_cell_trace_wait, }; enum afs_fs_operation { afs_FS_FetchData = 130, /* AFS Fetch file data */ afs_FS_FetchACL = 131, /* AFS Fetch file ACL */ afs_FS_FetchStatus = 132, /* AFS Fetch file status */ afs_FS_StoreData = 133, /* AFS Store file data */ afs_FS_StoreACL = 134, /* AFS Store file ACL */ afs_FS_StoreStatus = 135, /* AFS Store file status */ afs_FS_RemoveFile = 136, /* AFS Remove a file */ afs_FS_CreateFile = 137, /* AFS Create a file */ afs_FS_Rename = 138, /* AFS Rename or move a file or directory */ afs_FS_Symlink = 139, /* AFS Create a symbolic link */ afs_FS_Link = 140, /* AFS Create a hard link */ afs_FS_MakeDir = 141, /* AFS Create a directory */ afs_FS_RemoveDir = 142, /* AFS Remove a directory */ afs_FS_GetVolumeInfo = 148, /* AFS Get information about a volume */ afs_FS_GetVolumeStatus = 149, /* AFS Get volume status information */ afs_FS_GetRootVolume = 151, /* AFS Get root volume name */ afs_FS_SetLock = 156, /* AFS Request a file lock */ afs_FS_ExtendLock = 157, /* AFS Extend a file lock */ afs_FS_ReleaseLock = 158, /* AFS Release a file lock */ afs_FS_Lookup = 161, /* AFS lookup file in directory */ afs_FS_InlineBulkStatus = 65536, /* AFS Fetch multiple file statuses with errors */ afs_FS_FetchData64 = 65537, /* AFS Fetch file data */ afs_FS_StoreData64 = 65538, /* AFS Store file data */ afs_FS_GiveUpAllCallBacks = 65539, /* AFS Give up all our callbacks on a server */ afs_FS_GetCapabilities = 65540, /* AFS Get FS server capabilities */ yfs_FS_FetchData = 130, /* YFS Fetch file data */ yfs_FS_FetchACL = 64131, /* YFS Fetch file ACL */ yfs_FS_FetchStatus = 64132, /* YFS Fetch file status */ yfs_FS_StoreACL = 64134, /* YFS Store file ACL */ yfs_FS_StoreStatus = 64135, /* YFS Store file status */ yfs_FS_RemoveFile = 64136, /* YFS Remove a file */ yfs_FS_CreateFile = 64137, /* YFS Create a file */ yfs_FS_Rename = 64138, /* YFS Rename or move a file or directory */ yfs_FS_Symlink = 64139, /* YFS Create a symbolic link */ yfs_FS_Link = 64140, /* YFS Create a hard link */ yfs_FS_MakeDir = 64141, /* YFS Create a directory */ yfs_FS_RemoveDir = 64142, /* YFS Remove a directory */ yfs_FS_GetVolumeStatus = 64149, /* YFS Get volume status information */ yfs_FS_SetVolumeStatus = 64150, /* YFS Set volume status information */ yfs_FS_SetLock = 64156, /* YFS Request a file lock */ yfs_FS_ExtendLock = 64157, /* YFS Extend a file lock */ yfs_FS_ReleaseLock = 64158, /* YFS Release a file lock */ yfs_FS_Lookup = 64161, /* YFS lookup file in directory */ yfs_FS_FlushCPS = 64165, yfs_FS_FetchOpaqueACL = 64168, yfs_FS_WhoAmI = 64170, yfs_FS_RemoveACL = 64171, yfs_FS_RemoveFile2 = 64173, yfs_FS_StoreOpaqueACL2 = 64174, yfs_FS_InlineBulkStatus = 64536, /* YFS Fetch multiple file statuses with errors */ yfs_FS_FetchData64 = 64537, /* YFS Fetch file data */ yfs_FS_StoreData64 = 64538, /* YFS Store file data */ yfs_FS_UpdateSymlink = 64540, }; enum afs_vl_operation { afs_VL_GetEntryByNameU = 527, /* AFS Get Vol Entry By Name operation ID */ afs_VL_GetAddrsU = 533, /* AFS Get FS server addresses */ afs_YFSVL_GetEndpoints = 64002, /* YFS Get FS & Vol server addresses */ afs_YFSVL_GetCellName = 64014, /* YFS Get actual cell name */ afs_VL_GetCapabilities = 65537, /* AFS Get VL server capabilities */ }; enum afs_cm_operation { afs_CB_CallBack = 204, /* AFS break callback promises */ afs_CB_InitCallBackState = 205, /* AFS initialise callback state */ afs_CB_Probe = 206, /* AFS probe client */ afs_CB_GetLock = 207, /* AFS get contents of CM lock table */ afs_CB_GetCE = 208, /* AFS get cache file description */ afs_CB_GetXStatsVersion = 209, /* AFS get version of extended statistics */ afs_CB_GetXStats = 210, /* AFS get contents of extended statistics data */ afs_CB_InitCallBackState3 = 213, /* AFS initialise callback state, version 3 */ afs_CB_ProbeUuid = 214, /* AFS check the client hasn't rebooted */ }; enum yfs_cm_operation { yfs_CB_Probe = 206, /* YFS probe client */ yfs_CB_GetLock = 207, /* YFS get contents of CM lock table */ yfs_CB_XStatsVersion = 209, /* YFS get version of extended statistics */ yfs_CB_GetXStats = 210, /* YFS get contents of extended statistics data */ yfs_CB_InitCallBackState3 = 213, /* YFS initialise callback state, version 3 */ yfs_CB_ProbeUuid = 214, /* YFS check the client hasn't rebooted */ yfs_CB_GetServerPrefs = 215, yfs_CB_GetCellServDV = 216, yfs_CB_GetLocalCell = 217, yfs_CB_GetCacheConfig = 218, yfs_CB_GetCellByNum = 65537, yfs_CB_TellMeAboutYourself = 65538, /* get client capabilities */ yfs_CB_CallBack = 64204, }; enum afs_edit_dir_op { afs_edit_dir_create, afs_edit_dir_create_error, afs_edit_dir_create_inval, afs_edit_dir_create_nospc, afs_edit_dir_delete, afs_edit_dir_delete_error, afs_edit_dir_delete_inval, afs_edit_dir_delete_noent, }; enum afs_edit_dir_reason { afs_edit_dir_for_create, afs_edit_dir_for_link, afs_edit_dir_for_mkdir, afs_edit_dir_for_rename_0, afs_edit_dir_for_rename_1, afs_edit_dir_for_rename_2, afs_edit_dir_for_rmdir, afs_edit_dir_for_silly_0, afs_edit_dir_for_silly_1, afs_edit_dir_for_symlink, afs_edit_dir_for_unlink, }; enum afs_eproto_cause { afs_eproto_bad_status, afs_eproto_cb_count, afs_eproto_cb_fid_count, afs_eproto_cellname_len, afs_eproto_file_type, afs_eproto_ibulkst_cb_count, afs_eproto_ibulkst_count, afs_eproto_motd_len, afs_eproto_offline_msg_len, afs_eproto_volname_len, afs_eproto_yvl_fsendpt4_len, afs_eproto_yvl_fsendpt6_len, afs_eproto_yvl_fsendpt_num, afs_eproto_yvl_fsendpt_type, afs_eproto_yvl_vlendpt4_len, afs_eproto_yvl_vlendpt6_len, afs_eproto_yvl_vlendpt_type, }; enum afs_io_error { afs_io_error_cm_reply, afs_io_error_extract, afs_io_error_fs_probe_fail, afs_io_error_vl_lookup_fail, afs_io_error_vl_probe_fail, }; enum afs_file_error { afs_file_error_dir_bad_magic, afs_file_error_dir_big, afs_file_error_dir_missing_page, afs_file_error_dir_name_too_long, afs_file_error_dir_over_end, afs_file_error_dir_small, afs_file_error_dir_unmarked_ext, afs_file_error_mntpt, afs_file_error_writeback_fail, }; enum afs_flock_event { afs_flock_acquired, afs_flock_callback_break, afs_flock_defer_unlock, afs_flock_extend_fail, afs_flock_fail_other, afs_flock_fail_perm, afs_flock_no_lockers, afs_flock_release_fail, afs_flock_silly_delete, afs_flock_timestamp, afs_flock_try_to_lock, afs_flock_vfs_lock, afs_flock_vfs_locking, afs_flock_waited, afs_flock_waiting, afs_flock_work_extending, afs_flock_work_retry, afs_flock_work_unlocking, afs_flock_would_block, }; enum afs_flock_operation { afs_flock_op_copy_lock, afs_flock_op_flock, afs_flock_op_grant, afs_flock_op_lock, afs_flock_op_release_lock, afs_flock_op_return_ok, afs_flock_op_return_eagain, afs_flock_op_return_edeadlk, afs_flock_op_return_error, afs_flock_op_set_lock, afs_flock_op_unlock, afs_flock_op_wake, }; enum afs_cb_break_reason { afs_cb_break_no_break, afs_cb_break_no_promise, afs_cb_break_for_callback, afs_cb_break_for_deleted, afs_cb_break_for_lapsed, afs_cb_break_for_s_reinit, afs_cb_break_for_unlink, afs_cb_break_for_v_break, afs_cb_break_for_volume_callback, afs_cb_break_for_zap, }; #endif /* end __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY */ /* * Declare tracing information enums and their string mappings for display. */ #define afs_call_traces \ EM(afs_call_trace_alloc, "ALLOC") \ EM(afs_call_trace_free, "FREE ") \ EM(afs_call_trace_get, "GET ") \ EM(afs_call_trace_put, "PUT ") \ EM(afs_call_trace_wake, "WAKE ") \ E_(afs_call_trace_work, "QUEUE") #define afs_server_traces \ EM(afs_server_trace_alloc, "ALLOC ") \ EM(afs_server_trace_callback, "CALLBACK ") \ EM(afs_server_trace_destroy, "DESTROY ") \ EM(afs_server_trace_free, "FREE ") \ EM(afs_server_trace_gc, "GC ") \ EM(afs_server_trace_get_by_addr, "GET addr ") \ EM(afs_server_trace_get_by_uuid, "GET uuid ") \ EM(afs_server_trace_get_caps, "GET caps ") \ EM(afs_server_trace_get_install, "GET inst ") \ EM(afs_server_trace_get_new_cbi, "GET cbi ") \ EM(afs_server_trace_get_probe, "GET probe") \ EM(afs_server_trace_give_up_cb, "giveup-cb") \ EM(afs_server_trace_purging, "PURGE ") \ EM(afs_server_trace_put_call, "PUT call ") \ EM(afs_server_trace_put_cbi, "PUT cbi ") \ EM(afs_server_trace_put_find_rsq, "PUT f-rsq") \ EM(afs_server_trace_put_probe, "PUT probe") \ EM(afs_server_trace_put_slist, "PUT slist") \ EM(afs_server_trace_put_slist_isort, "PUT isort") \ EM(afs_server_trace_put_uuid_rsq, "PUT u-req") \ E_(afs_server_trace_update, "UPDATE") #define afs_volume_traces \ EM(afs_volume_trace_alloc, "ALLOC ") \ EM(afs_volume_trace_free, "FREE ") \ EM(afs_volume_trace_get_alloc_sbi, "GET sbi-alloc ") \ EM(afs_volume_trace_get_cell_insert, "GET cell-insrt") \ EM(afs_volume_trace_get_new_op, "GET op-new ") \ EM(afs_volume_trace_get_query_alias, "GET cell-alias") \ EM(afs_volume_trace_put_cell_dup, "PUT cell-dup ") \ EM(afs_volume_trace_put_cell_root, "PUT cell-root ") \ EM(afs_volume_trace_put_destroy_sbi, "PUT sbi-destry") \ EM(afs_volume_trace_put_free_fc, "PUT fc-free ") \ EM(afs_volume_trace_put_put_op, "PUT op-put ") \ EM(afs_volume_trace_put_query_alias, "PUT cell-alias") \ EM(afs_volume_trace_put_validate_fc, "PUT fc-validat") \ E_(afs_volume_trace_remove, "REMOVE ") #define afs_cell_traces \ EM(afs_cell_trace_alloc, "ALLOC ") \ EM(afs_cell_trace_free, "FREE ") \ EM(afs_cell_trace_get_queue_dns, "GET q-dns ") \ EM(afs_cell_trace_get_queue_manage, "GET q-mng ") \ EM(afs_cell_trace_get_queue_new, "GET q-new ") \ EM(afs_cell_trace_get_vol, "GET vol ") \ EM(afs_cell_trace_insert, "INSERT ") \ EM(afs_cell_trace_manage, "MANAGE ") \ EM(afs_cell_trace_put_candidate, "PUT candid") \ EM(afs_cell_trace_put_destroy, "PUT destry") \ EM(afs_cell_trace_put_queue_work, "PUT q-work") \ EM(afs_cell_trace_put_queue_fail, "PUT q-fail") \ EM(afs_cell_trace_put_vol, "PUT vol ") \ EM(afs_cell_trace_see_source, "SEE source") \ EM(afs_cell_trace_see_ws, "SEE ws ") \ EM(afs_cell_trace_unuse_alias, "UNU alias ") \ EM(afs_cell_trace_unuse_check_alias, "UNU chk-al") \ EM(afs_cell_trace_unuse_delete, "UNU delete") \ EM(afs_cell_trace_unuse_fc, "UNU fc ") \ EM(afs_cell_trace_unuse_lookup, "UNU lookup") \ EM(afs_cell_trace_unuse_mntpt, "UNU mntpt ") \ EM(afs_cell_trace_unuse_parse, "UNU parse ") \ EM(afs_cell_trace_unuse_pin, "UNU pin ") \ EM(afs_cell_trace_unuse_probe, "UNU probe ") \ EM(afs_cell_trace_unuse_sbi, "UNU sbi ") \ EM(afs_cell_trace_unuse_ws, "UNU ws ") \ EM(afs_cell_trace_use_alias, "USE alias ") \ EM(afs_cell_trace_use_check_alias, "USE chk-al") \ EM(afs_cell_trace_use_fc, "USE fc ") \ EM(afs_cell_trace_use_fc_alias, "USE fc-al ") \ EM(afs_cell_trace_use_lookup, "USE lookup") \ EM(afs_cell_trace_use_mntpt, "USE mntpt ") \ EM(afs_cell_trace_use_pin, "USE pin ") \ EM(afs_cell_trace_use_probe, "USE probe ") \ EM(afs_cell_trace_use_sbi, "USE sbi ") \ E_(afs_cell_trace_wait, "WAIT ") #define afs_fs_operations \ EM(afs_FS_FetchData, "FS.FetchData") \ EM(afs_FS_FetchStatus, "FS.FetchStatus") \ EM(afs_FS_StoreData, "FS.StoreData") \ EM(afs_FS_StoreStatus, "FS.StoreStatus") \ EM(afs_FS_RemoveFile, "FS.RemoveFile") \ EM(afs_FS_CreateFile, "FS.CreateFile") \ EM(afs_FS_Rename, "FS.Rename") \ EM(afs_FS_Symlink, "FS.Symlink") \ EM(afs_FS_Link, "FS.Link") \ EM(afs_FS_MakeDir, "FS.MakeDir") \ EM(afs_FS_RemoveDir, "FS.RemoveDir") \ EM(afs_FS_GetVolumeInfo, "FS.GetVolumeInfo") \ EM(afs_FS_GetVolumeStatus, "FS.GetVolumeStatus") \ EM(afs_FS_GetRootVolume, "FS.GetRootVolume") \ EM(afs_FS_SetLock, "FS.SetLock") \ EM(afs_FS_ExtendLock, "FS.ExtendLock") \ EM(afs_FS_ReleaseLock, "FS.ReleaseLock") \ EM(afs_FS_Lookup, "FS.Lookup") \ EM(afs_FS_InlineBulkStatus, "FS.InlineBulkStatus") \ EM(afs_FS_FetchData64, "FS.FetchData64") \ EM(afs_FS_StoreData64, "FS.StoreData64") \ EM(afs_FS_GiveUpAllCallBacks, "FS.GiveUpAllCallBacks") \ EM(afs_FS_GetCapabilities, "FS.GetCapabilities") \ EM(yfs_FS_FetchACL, "YFS.FetchACL") \ EM(yfs_FS_FetchStatus, "YFS.FetchStatus") \ EM(yfs_FS_StoreACL, "YFS.StoreACL") \ EM(yfs_FS_StoreStatus, "YFS.StoreStatus") \ EM(yfs_FS_RemoveFile, "YFS.RemoveFile") \ EM(yfs_FS_CreateFile, "YFS.CreateFile") \ EM(yfs_FS_Rename, "YFS.Rename") \ EM(yfs_FS_Symlink, "YFS.Symlink") \ EM(yfs_FS_Link, "YFS.Link") \ EM(yfs_FS_MakeDir, "YFS.MakeDir") \ EM(yfs_FS_RemoveDir, "YFS.RemoveDir") \ EM(yfs_FS_GetVolumeStatus, "YFS.GetVolumeStatus") \ EM(yfs_FS_SetVolumeStatus, "YFS.SetVolumeStatus") \ EM(yfs_FS_SetLock, "YFS.SetLock") \ EM(yfs_FS_ExtendLock, "YFS.ExtendLock") \ EM(yfs_FS_ReleaseLock, "YFS.ReleaseLock") \ EM(yfs_FS_Lookup, "YFS.Lookup") \ EM(yfs_FS_FlushCPS, "YFS.FlushCPS") \ EM(yfs_FS_FetchOpaqueACL, "YFS.FetchOpaqueACL") \ EM(yfs_FS_WhoAmI, "YFS.WhoAmI") \ EM(yfs_FS_RemoveACL, "YFS.RemoveACL") \ EM(yfs_FS_RemoveFile2, "YFS.RemoveFile2") \ EM(yfs_FS_StoreOpaqueACL2, "YFS.StoreOpaqueACL2") \ EM(yfs_FS_InlineBulkStatus, "YFS.InlineBulkStatus") \ EM(yfs_FS_FetchData64, "YFS.FetchData64") \ EM(yfs_FS_StoreData64, "YFS.StoreData64") \ E_(yfs_FS_UpdateSymlink, "YFS.UpdateSymlink") #define afs_vl_operations \ EM(afs_VL_GetEntryByNameU, "VL.GetEntryByNameU") \ EM(afs_VL_GetAddrsU, "VL.GetAddrsU") \ EM(afs_YFSVL_GetEndpoints, "YFSVL.GetEndpoints") \ EM(afs_YFSVL_GetCellName, "YFSVL.GetCellName") \ E_(afs_VL_GetCapabilities, "VL.GetCapabilities") #define afs_cm_operations \ EM(afs_CB_CallBack, "CB.CallBack") \ EM(afs_CB_InitCallBackState, "CB.InitCallBackState") \ EM(afs_CB_Probe, "CB.Probe") \ EM(afs_CB_GetLock, "CB.GetLock") \ EM(afs_CB_GetCE, "CB.GetCE") \ EM(afs_CB_GetXStatsVersion, "CB.GetXStatsVersion") \ EM(afs_CB_GetXStats, "CB.GetXStats") \ EM(afs_CB_InitCallBackState3, "CB.InitCallBackState3") \ E_(afs_CB_ProbeUuid, "CB.ProbeUuid") #define yfs_cm_operations \ EM(yfs_CB_Probe, "YFSCB.Probe") \ EM(yfs_CB_GetLock, "YFSCB.GetLock") \ EM(yfs_CB_XStatsVersion, "YFSCB.XStatsVersion") \ EM(yfs_CB_GetXStats, "YFSCB.GetXStats") \ EM(yfs_CB_InitCallBackState3, "YFSCB.InitCallBackState3") \ EM(yfs_CB_ProbeUuid, "YFSCB.ProbeUuid") \ EM(yfs_CB_GetServerPrefs, "YFSCB.GetServerPrefs") \ EM(yfs_CB_GetCellServDV, "YFSCB.GetCellServDV") \ EM(yfs_CB_GetLocalCell, "YFSCB.GetLocalCell") \ EM(yfs_CB_GetCacheConfig, "YFSCB.GetCacheConfig") \ EM(yfs_CB_GetCellByNum, "YFSCB.GetCellByNum") \ EM(yfs_CB_TellMeAboutYourself, "YFSCB.TellMeAboutYourself") \ E_(yfs_CB_CallBack, "YFSCB.CallBack") #define afs_edit_dir_ops \ EM(afs_edit_dir_create, "create") \ EM(afs_edit_dir_create_error, "c_fail") \ EM(afs_edit_dir_create_inval, "c_invl") \ EM(afs_edit_dir_create_nospc, "c_nspc") \ EM(afs_edit_dir_delete, "delete") \ EM(afs_edit_dir_delete_error, "d_err ") \ EM(afs_edit_dir_delete_inval, "d_invl") \ E_(afs_edit_dir_delete_noent, "d_nent") #define afs_edit_dir_reasons \ EM(afs_edit_dir_for_create, "Create") \ EM(afs_edit_dir_for_link, "Link ") \ EM(afs_edit_dir_for_mkdir, "MkDir ") \ EM(afs_edit_dir_for_rename_0, "Renam0") \ EM(afs_edit_dir_for_rename_1, "Renam1") \ EM(afs_edit_dir_for_rename_2, "Renam2") \ EM(afs_edit_dir_for_rmdir, "RmDir ") \ EM(afs_edit_dir_for_silly_0, "S_Ren0") \ EM(afs_edit_dir_for_silly_1, "S_Ren1") \ EM(afs_edit_dir_for_symlink, "Symlnk") \ E_(afs_edit_dir_for_unlink, "Unlink") #define afs_eproto_causes \ EM(afs_eproto_bad_status, "BadStatus") \ EM(afs_eproto_cb_count, "CbCount") \ EM(afs_eproto_cb_fid_count, "CbFidCount") \ EM(afs_eproto_cellname_len, "CellNameLen") \ EM(afs_eproto_file_type, "FileTYpe") \ EM(afs_eproto_ibulkst_cb_count, "IBS.CbCount") \ EM(afs_eproto_ibulkst_count, "IBS.FidCount") \ EM(afs_eproto_motd_len, "MotdLen") \ EM(afs_eproto_offline_msg_len, "OfflineMsgLen") \ EM(afs_eproto_volname_len, "VolNameLen") \ EM(afs_eproto_yvl_fsendpt4_len, "YVL.FsEnd4Len") \ EM(afs_eproto_yvl_fsendpt6_len, "YVL.FsEnd6Len") \ EM(afs_eproto_yvl_fsendpt_num, "YVL.FsEndCount") \ EM(afs_eproto_yvl_fsendpt_type, "YVL.FsEndType") \ EM(afs_eproto_yvl_vlendpt4_len, "YVL.VlEnd4Len") \ EM(afs_eproto_yvl_vlendpt6_len, "YVL.VlEnd6Len") \ E_(afs_eproto_yvl_vlendpt_type, "YVL.VlEndType") #define afs_io_errors \ EM(afs_io_error_cm_reply, "CM_REPLY") \ EM(afs_io_error_extract, "EXTRACT") \ EM(afs_io_error_fs_probe_fail, "FS_PROBE_FAIL") \ EM(afs_io_error_vl_lookup_fail, "VL_LOOKUP_FAIL") \ E_(afs_io_error_vl_probe_fail, "VL_PROBE_FAIL") #define afs_file_errors \ EM(afs_file_error_dir_bad_magic, "DIR_BAD_MAGIC") \ EM(afs_file_error_dir_big, "DIR_BIG") \ EM(afs_file_error_dir_missing_page, "DIR_MISSING_PAGE") \ EM(afs_file_error_dir_name_too_long, "DIR_NAME_TOO_LONG") \ EM(afs_file_error_dir_over_end, "DIR_ENT_OVER_END") \ EM(afs_file_error_dir_small, "DIR_SMALL") \ EM(afs_file_error_dir_unmarked_ext, "DIR_UNMARKED_EXT") \ EM(afs_file_error_mntpt, "MNTPT_READ_FAILED") \ E_(afs_file_error_writeback_fail, "WRITEBACK_FAILED") #define afs_flock_types \ EM(F_RDLCK, "RDLCK") \ EM(F_WRLCK, "WRLCK") \ E_(F_UNLCK, "UNLCK") #define afs_flock_states \ EM(AFS_VNODE_LOCK_NONE, "NONE") \ EM(AFS_VNODE_LOCK_WAITING_FOR_CB, "WAIT_FOR_CB") \ EM(AFS_VNODE_LOCK_SETTING, "SETTING") \ EM(AFS_VNODE_LOCK_GRANTED, "GRANTED") \ EM(AFS_VNODE_LOCK_EXTENDING, "EXTENDING") \ EM(AFS_VNODE_LOCK_NEED_UNLOCK, "NEED_UNLOCK") \ EM(AFS_VNODE_LOCK_UNLOCKING, "UNLOCKING") \ E_(AFS_VNODE_LOCK_DELETED, "DELETED") #define afs_flock_events \ EM(afs_flock_acquired, "Acquired") \ EM(afs_flock_callback_break, "Callback") \ EM(afs_flock_defer_unlock, "D-Unlock") \ EM(afs_flock_extend_fail, "Ext_Fail") \ EM(afs_flock_fail_other, "ErrOther") \ EM(afs_flock_fail_perm, "ErrPerm ") \ EM(afs_flock_no_lockers, "NoLocker") \ EM(afs_flock_release_fail, "Rel_Fail") \ EM(afs_flock_silly_delete, "SillyDel") \ EM(afs_flock_timestamp, "Timestmp") \ EM(afs_flock_try_to_lock, "TryToLck") \ EM(afs_flock_vfs_lock, "VFSLock ") \ EM(afs_flock_vfs_locking, "VFSLking") \ EM(afs_flock_waited, "Waited ") \ EM(afs_flock_waiting, "Waiting ") \ EM(afs_flock_work_extending, "Extendng") \ EM(afs_flock_work_retry, "Retry ") \ EM(afs_flock_work_unlocking, "Unlcking") \ E_(afs_flock_would_block, "EWOULDBL") #define afs_flock_operations \ EM(afs_flock_op_copy_lock, "COPY ") \ EM(afs_flock_op_flock, "->flock ") \ EM(afs_flock_op_grant, "GRANT ") \ EM(afs_flock_op_lock, "->lock ") \ EM(afs_flock_op_release_lock, "RELEASE ") \ EM(afs_flock_op_return_ok, "<-OK ") \ EM(afs_flock_op_return_edeadlk, "<-EDEADL") \ EM(afs_flock_op_return_eagain, "<-EAGAIN") \ EM(afs_flock_op_return_error, "<-ERROR ") \ EM(afs_flock_op_set_lock, "SET ") \ EM(afs_flock_op_unlock, "UNLOCK ") \ E_(afs_flock_op_wake, "WAKE ") #define afs_cb_break_reasons \ EM(afs_cb_break_no_break, "no-break") \ EM(afs_cb_break_no_promise, "no-promise") \ EM(afs_cb_break_for_callback, "break-cb") \ EM(afs_cb_break_for_deleted, "break-del") \ EM(afs_cb_break_for_lapsed, "break-lapsed") \ EM(afs_cb_break_for_s_reinit, "s-reinit") \ EM(afs_cb_break_for_unlink, "break-unlink") \ EM(afs_cb_break_for_v_break, "break-v") \ EM(afs_cb_break_for_volume_callback, "break-v-cb") \ E_(afs_cb_break_for_zap, "break-zap") /* * Export enum symbols via userspace. */ #undef EM #undef E_ #define EM(a, b) TRACE_DEFINE_ENUM(a); #define E_(a, b) TRACE_DEFINE_ENUM(a); afs_call_traces; afs_server_traces; afs_cell_traces; afs_fs_operations; afs_vl_operations; afs_cm_operations; yfs_cm_operations; afs_edit_dir_ops; afs_edit_dir_reasons; afs_eproto_causes; afs_io_errors; afs_file_errors; afs_flock_types; afs_flock_operations; afs_cb_break_reasons; /* * Now redefine the EM() and E_() macros to map the enums to the strings that * will be printed in the output. */ #undef EM #undef E_ #define EM(a, b) { a, b }, #define E_(a, b) { a, b } TRACE_EVENT(afs_receive_data, TP_PROTO(struct afs_call *call, struct iov_iter *iter, bool want_more, int ret), TP_ARGS(call, iter, want_more, ret), TP_STRUCT__entry( __field(loff_t, remain ) __field(unsigned int, call ) __field(enum afs_call_state, state ) __field(unsigned short, unmarshall ) __field(bool, want_more ) __field(int, ret ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->state = call->state; __entry->unmarshall = call->unmarshall; __entry->remain = iov_iter_count(iter); __entry->want_more = want_more; __entry->ret = ret; ), TP_printk("c=%08x r=%llu u=%u w=%u s=%u ret=%d", __entry->call, __entry->remain, __entry->unmarshall, __entry->want_more, __entry->state, __entry->ret) ); TRACE_EVENT(afs_notify_call, TP_PROTO(struct rxrpc_call *rxcall, struct afs_call *call), TP_ARGS(rxcall, call), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum afs_call_state, state ) __field(unsigned short, unmarshall ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->state = call->state; __entry->unmarshall = call->unmarshall; ), TP_printk("c=%08x s=%u u=%u", __entry->call, __entry->state, __entry->unmarshall) ); TRACE_EVENT(afs_cb_call, TP_PROTO(struct afs_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call ) __field(u32, op ) __field(u16, service_id ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op = call->operation_ID; __entry->service_id = call->service_id; ), TP_printk("c=%08x %s", __entry->call, __entry->service_id == 2501 ? __print_symbolic(__entry->op, yfs_cm_operations) : __print_symbolic(__entry->op, afs_cm_operations)) ); TRACE_EVENT(afs_call, TP_PROTO(struct afs_call *call, enum afs_call_trace op, int ref, int outstanding, const void *where), TP_ARGS(call, op, ref, outstanding, where), TP_STRUCT__entry( __field(unsigned int, call ) __field(int, op ) __field(int, ref ) __field(int, outstanding ) __field(const void *, where ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op = op; __entry->ref = ref; __entry->outstanding = outstanding; __entry->where = where; ), TP_printk("c=%08x %s r=%d o=%d sp=%pSR", __entry->call, __print_symbolic(__entry->op, afs_call_traces), __entry->ref, __entry->outstanding, __entry->where) ); TRACE_EVENT(afs_make_fs_call, TP_PROTO(struct afs_call *call, const struct afs_fid *fid), TP_ARGS(call, fid), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum afs_fs_operation, op ) __field_struct(struct afs_fid, fid ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op = call->operation_ID; if (fid) { __entry->fid = *fid; } else { __entry->fid.vid = 0; __entry->fid.vnode = 0; __entry->fid.unique = 0; } ), TP_printk("c=%08x %06llx:%06llx:%06x %s", __entry->call, __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __print_symbolic(__entry->op, afs_fs_operations)) ); TRACE_EVENT(afs_make_fs_calli, TP_PROTO(struct afs_call *call, const struct afs_fid *fid, unsigned int i), TP_ARGS(call, fid, i), TP_STRUCT__entry( __field(unsigned int, call ) __field(unsigned int, i ) __field(enum afs_fs_operation, op ) __field_struct(struct afs_fid, fid ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->i = i; __entry->op = call->operation_ID; if (fid) { __entry->fid = *fid; } else { __entry->fid.vid = 0; __entry->fid.vnode = 0; __entry->fid.unique = 0; } ), TP_printk("c=%08x %06llx:%06llx:%06x %s i=%u", __entry->call, __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __print_symbolic(__entry->op, afs_fs_operations), __entry->i) ); TRACE_EVENT(afs_make_fs_call1, TP_PROTO(struct afs_call *call, const struct afs_fid *fid, const struct qstr *name), TP_ARGS(call, fid, name), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum afs_fs_operation, op ) __field_struct(struct afs_fid, fid ) __array(char, name, 24 ) ), TP_fast_assign( unsigned int __len = min_t(unsigned int, name->len, 23); __entry->call = call->debug_id; __entry->op = call->operation_ID; if (fid) { __entry->fid = *fid; } else { __entry->fid.vid = 0; __entry->fid.vnode = 0; __entry->fid.unique = 0; } memcpy(__entry->name, name->name, __len); __entry->name[__len] = 0; ), TP_printk("c=%08x %06llx:%06llx:%06x %s \"%s\"", __entry->call, __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __print_symbolic(__entry->op, afs_fs_operations), __entry->name) ); TRACE_EVENT(afs_make_fs_call2, TP_PROTO(struct afs_call *call, const struct afs_fid *fid, const struct qstr *name, const struct qstr *name2), TP_ARGS(call, fid, name, name2), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum afs_fs_operation, op ) __field_struct(struct afs_fid, fid ) __array(char, name, 24 ) __array(char, name2, 24 ) ), TP_fast_assign( unsigned int __len = min_t(unsigned int, name->len, 23); unsigned int __len2 = min_t(unsigned int, name2->len, 23); __entry->call = call->debug_id; __entry->op = call->operation_ID; if (fid) { __entry->fid = *fid; } else { __entry->fid.vid = 0; __entry->fid.vnode = 0; __entry->fid.unique = 0; } memcpy(__entry->name, name->name, __len); __entry->name[__len] = 0; memcpy(__entry->name2, name2->name, __len2); __entry->name2[__len2] = 0; ), TP_printk("c=%08x %06llx:%06llx:%06x %s \"%s\" \"%s\"", __entry->call, __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __print_symbolic(__entry->op, afs_fs_operations), __entry->name, __entry->name2) ); TRACE_EVENT(afs_make_vl_call, TP_PROTO(struct afs_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum afs_vl_operation, op ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op = call->operation_ID; ), TP_printk("c=%08x %s", __entry->call, __print_symbolic(__entry->op, afs_vl_operations)) ); TRACE_EVENT(afs_call_done, TP_PROTO(struct afs_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call ) __field(struct rxrpc_call *, rx_call ) __field(int, ret ) __field(u32, abort_code ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->rx_call = call->rxcall; __entry->ret = call->error; __entry->abort_code = call->abort_code; ), TP_printk(" c=%08x ret=%d ab=%d [%p]", __entry->call, __entry->ret, __entry->abort_code, __entry->rx_call) ); TRACE_EVENT(afs_send_data, TP_PROTO(struct afs_call *call, struct msghdr *msg), TP_ARGS(call, msg), TP_STRUCT__entry( __field(unsigned int, call ) __field(unsigned int, flags ) __field(loff_t, offset ) __field(loff_t, count ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->flags = msg->msg_flags; __entry->offset = msg->msg_iter.xarray_start + msg->msg_iter.iov_offset; __entry->count = iov_iter_count(&msg->msg_iter); ), TP_printk(" c=%08x o=%llx n=%llx f=%x", __entry->call, __entry->offset, __entry->count, __entry->flags) ); TRACE_EVENT(afs_sent_data, TP_PROTO(struct afs_call *call, struct msghdr *msg, int ret), TP_ARGS(call, msg, ret), TP_STRUCT__entry( __field(unsigned int, call ) __field(int, ret ) __field(loff_t, offset ) __field(loff_t, count ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->ret = ret; __entry->offset = msg->msg_iter.xarray_start + msg->msg_iter.iov_offset; __entry->count = iov_iter_count(&msg->msg_iter); ), TP_printk(" c=%08x o=%llx n=%llx r=%x", __entry->call, __entry->offset, __entry->count, __entry->ret) ); TRACE_EVENT(afs_dir_check_failed, TP_PROTO(struct afs_vnode *vnode, loff_t off, loff_t i_size), TP_ARGS(vnode, off, i_size), TP_STRUCT__entry( __field(struct afs_vnode *, vnode ) __field(loff_t, off ) __field(loff_t, i_size ) ), TP_fast_assign( __entry->vnode = vnode; __entry->off = off; __entry->i_size = i_size; ), TP_printk("vn=%p %llx/%llx", __entry->vnode, __entry->off, __entry->i_size) ); TRACE_EVENT(afs_page_dirty, TP_PROTO(struct afs_vnode *vnode, const char *where, struct page *page), TP_ARGS(vnode, where, page), TP_STRUCT__entry( __field(struct afs_vnode *, vnode ) __field(const char *, where ) __field(pgoff_t, page ) __field(unsigned long, from ) __field(unsigned long, to ) ), TP_fast_assign( __entry->vnode = vnode; __entry->where = where; __entry->page = page->index; __entry->from = afs_page_dirty_from(page, page->private); __entry->to = afs_page_dirty_to(page, page->private); __entry->to |= (afs_is_page_dirty_mmapped(page->private) ? (1UL << (BITS_PER_LONG - 1)) : 0); ), TP_printk("vn=%p %lx %s %lx-%lx%s", __entry->vnode, __entry->page, __entry->where, __entry->from, __entry->to & ~(1UL << (BITS_PER_LONG - 1)), __entry->to & (1UL << (BITS_PER_LONG - 1)) ? " M" : "") ); TRACE_EVENT(afs_call_state, TP_PROTO(struct afs_call *call, enum afs_call_state from, enum afs_call_state to, int ret, u32 remote_abort), TP_ARGS(call, from, to, ret, remote_abort), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum afs_call_state, from ) __field(enum afs_call_state, to ) __field(int, ret ) __field(u32, abort ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->from = from; __entry->to = to; __entry->ret = ret; __entry->abort = remote_abort; ), TP_printk("c=%08x %u->%u r=%d ab=%d", __entry->call, __entry->from, __entry->to, __entry->ret, __entry->abort) ); TRACE_EVENT(afs_lookup, TP_PROTO(struct afs_vnode *dvnode, const struct qstr *name, struct afs_fid *fid), TP_ARGS(dvnode, name, fid), TP_STRUCT__entry( __field_struct(struct afs_fid, dfid ) __field_struct(struct afs_fid, fid ) __array(char, name, 24 ) ), TP_fast_assign( int __len = min_t(int, name->len, 23); __entry->dfid = dvnode->fid; __entry->fid = *fid; memcpy(__entry->name, name->name, __len); __entry->name[__len] = 0; ), TP_printk("d=%llx:%llx:%x \"%s\" f=%llx:%x", __entry->dfid.vid, __entry->dfid.vnode, __entry->dfid.unique, __entry->name, __entry->fid.vnode, __entry->fid.unique) ); TRACE_EVENT(afs_edit_dir, TP_PROTO(struct afs_vnode *dvnode, enum afs_edit_dir_reason why, enum afs_edit_dir_op op, unsigned int block, unsigned int slot, unsigned int f_vnode, unsigned int f_unique, const char *name), TP_ARGS(dvnode, why, op, block, slot, f_vnode, f_unique, name), TP_STRUCT__entry( __field(unsigned int, vnode ) __field(unsigned int, unique ) __field(enum afs_edit_dir_reason, why ) __field(enum afs_edit_dir_op, op ) __field(unsigned int, block ) __field(unsigned short, slot ) __field(unsigned int, f_vnode ) __field(unsigned int, f_unique ) __array(char, name, 24 ) ), TP_fast_assign( int __len = strlen(name); __len = min(__len, 23); __entry->vnode = dvnode->fid.vnode; __entry->unique = dvnode->fid.unique; __entry->why = why; __entry->op = op; __entry->block = block; __entry->slot = slot; __entry->f_vnode = f_vnode; __entry->f_unique = f_unique; memcpy(__entry->name, name, __len); __entry->name[__len] = 0; ), TP_printk("d=%x:%x %s %s %u[%u] f=%x:%x \"%s\"", __entry->vnode, __entry->unique, __print_symbolic(__entry->why, afs_edit_dir_reasons), __print_symbolic(__entry->op, afs_edit_dir_ops), __entry->block, __entry->slot, __entry->f_vnode, __entry->f_unique, __entry->name) ); TRACE_EVENT(afs_protocol_error, TP_PROTO(struct afs_call *call, enum afs_eproto_cause cause), TP_ARGS(call, cause), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum afs_eproto_cause, cause ) ), TP_fast_assign( __entry->call = call ? call->debug_id : 0; __entry->cause = cause; ), TP_printk("c=%08x %s", __entry->call, __print_symbolic(__entry->cause, afs_eproto_causes)) ); TRACE_EVENT(afs_io_error, TP_PROTO(unsigned int call, int error, enum afs_io_error where), TP_ARGS(call, error, where), TP_STRUCT__entry( __field(unsigned int, call ) __field(int, error ) __field(enum afs_io_error, where ) ), TP_fast_assign( __entry->call = call; __entry->error = error; __entry->where = where; ), TP_printk("c=%08x r=%d %s", __entry->call, __entry->error, __print_symbolic(__entry->where, afs_io_errors)) ); TRACE_EVENT(afs_file_error, TP_PROTO(struct afs_vnode *vnode, int error, enum afs_file_error where), TP_ARGS(vnode, error, where), TP_STRUCT__entry( __field_struct(struct afs_fid, fid ) __field(int, error ) __field(enum afs_file_error, where ) ), TP_fast_assign( __entry->fid = vnode->fid; __entry->error = error; __entry->where = where; ), TP_printk("%llx:%llx:%x r=%d %s", __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __entry->error, __print_symbolic(__entry->where, afs_file_errors)) ); TRACE_EVENT(afs_cm_no_server, TP_PROTO(struct afs_call *call, struct sockaddr_rxrpc *srx), TP_ARGS(call, srx), TP_STRUCT__entry( __field(unsigned int, call ) __field(unsigned int, op_id ) __field_struct(struct sockaddr_rxrpc, srx ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op_id = call->operation_ID; memcpy(&__entry->srx, srx, sizeof(__entry->srx)); ), TP_printk("c=%08x op=%u %pISpc", __entry->call, __entry->op_id, &__entry->srx.transport) ); TRACE_EVENT(afs_cm_no_server_u, TP_PROTO(struct afs_call *call, const uuid_t *uuid), TP_ARGS(call, uuid), TP_STRUCT__entry( __field(unsigned int, call ) __field(unsigned int, op_id ) __field_struct(uuid_t, uuid ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op_id = call->operation_ID; memcpy(&__entry->uuid, uuid, sizeof(__entry->uuid)); ), TP_printk("c=%08x op=%u %pU", __entry->call, __entry->op_id, &__entry->uuid) ); TRACE_EVENT(afs_flock_ev, TP_PROTO(struct afs_vnode *vnode, struct file_lock *fl, enum afs_flock_event event, int error), TP_ARGS(vnode, fl, event, error), TP_STRUCT__entry( __field_struct(struct afs_fid, fid ) __field(enum afs_flock_event, event ) __field(enum afs_lock_state, state ) __field(int, error ) __field(unsigned int, debug_id ) ), TP_fast_assign( __entry->fid = vnode->fid; __entry->event = event; __entry->state = vnode->lock_state; __entry->error = error; __entry->debug_id = fl ? fl->fl_u.afs.debug_id : 0; ), TP_printk("%llx:%llx:%x %04x %s s=%s e=%d", __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __entry->debug_id, __print_symbolic(__entry->event, afs_flock_events), __print_symbolic(__entry->state, afs_flock_states), __entry->error) ); TRACE_EVENT(afs_flock_op, TP_PROTO(struct afs_vnode *vnode, struct file_lock *fl, enum afs_flock_operation op), TP_ARGS(vnode, fl, op), TP_STRUCT__entry( __field_struct(struct afs_fid, fid ) __field(loff_t, from ) __field(loff_t, len ) __field(enum afs_flock_operation, op ) __field(unsigned char, type ) __field(unsigned int, flags ) __field(unsigned int, debug_id ) ), TP_fast_assign( __entry->fid = vnode->fid; __entry->from = fl->fl_start; __entry->len = fl->fl_end - fl->fl_start + 1; __entry->op = op; __entry->type = fl->fl_type; __entry->flags = fl->fl_flags; __entry->debug_id = fl->fl_u.afs.debug_id; ), TP_printk("%llx:%llx:%x %04x %s t=%s R=%llx/%llx f=%x", __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __entry->debug_id, __print_symbolic(__entry->op, afs_flock_operations), __print_symbolic(__entry->type, afs_flock_types), __entry->from, __entry->len, __entry->flags) ); TRACE_EVENT(afs_reload_dir, TP_PROTO(struct afs_vnode *vnode), TP_ARGS(vnode), TP_STRUCT__entry( __field_struct(struct afs_fid, fid ) ), TP_fast_assign( __entry->fid = vnode->fid; ), TP_printk("%llx:%llx:%x", __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique) ); TRACE_EVENT(afs_silly_rename, TP_PROTO(struct afs_vnode *vnode, bool done), TP_ARGS(vnode, done), TP_STRUCT__entry( __field_struct(struct afs_fid, fid ) __field(bool, done ) ), TP_fast_assign( __entry->fid = vnode->fid; __entry->done = done; ), TP_printk("%llx:%llx:%x done=%u", __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __entry->done) ); TRACE_EVENT(afs_get_tree, TP_PROTO(struct afs_cell *cell, struct afs_volume *volume), TP_ARGS(cell, volume), TP_STRUCT__entry( __field(u64, vid ) __array(char, cell, 24 ) __array(char, volume, 24 ) ), TP_fast_assign( int __len; __entry->vid = volume->vid; __len = min_t(int, cell->name_len, 23); memcpy(__entry->cell, cell->name, __len); __entry->cell[__len] = 0; __len = min_t(int, volume->name_len, 23); memcpy(__entry->volume, volume->name, __len); __entry->volume[__len] = 0; ), TP_printk("--- MOUNT %s:%s %llx", __entry->cell, __entry->volume, __entry->vid) ); TRACE_EVENT(afs_cb_break, TP_PROTO(struct afs_fid *fid, unsigned int cb_break, enum afs_cb_break_reason reason, bool skipped), TP_ARGS(fid, cb_break, reason, skipped), TP_STRUCT__entry( __field_struct(struct afs_fid, fid ) __field(unsigned int, cb_break ) __field(enum afs_cb_break_reason, reason ) __field(bool, skipped ) ), TP_fast_assign( __entry->fid = *fid; __entry->cb_break = cb_break; __entry->reason = reason; __entry->skipped = skipped; ), TP_printk("%llx:%llx:%x b=%x s=%u %s", __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __entry->cb_break, __entry->skipped, __print_symbolic(__entry->reason, afs_cb_break_reasons)) ); TRACE_EVENT(afs_cb_miss, TP_PROTO(struct afs_fid *fid, enum afs_cb_break_reason reason), TP_ARGS(fid, reason), TP_STRUCT__entry( __field_struct(struct afs_fid, fid ) __field(enum afs_cb_break_reason, reason ) ), TP_fast_assign( __entry->fid = *fid; __entry->reason = reason; ), TP_printk(" %llx:%llx:%x %s", __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, __print_symbolic(__entry->reason, afs_cb_break_reasons)) ); TRACE_EVENT(afs_server, TP_PROTO(struct afs_server *server, int ref, int active, enum afs_server_trace reason), TP_ARGS(server, ref, active, reason), TP_STRUCT__entry( __field(unsigned int, server ) __field(int, ref ) __field(int, active ) __field(int, reason ) ), TP_fast_assign( __entry->server = server->debug_id; __entry->ref = ref; __entry->active = active; __entry->reason = reason; ), TP_printk("s=%08x %s u=%d a=%d", __entry->server, __print_symbolic(__entry->reason, afs_server_traces), __entry->ref, __entry->active) ); TRACE_EVENT(afs_volume, TP_PROTO(afs_volid_t vid, int ref, enum afs_volume_trace reason), TP_ARGS(vid, ref, reason), TP_STRUCT__entry( __field(afs_volid_t, vid ) __field(int, ref ) __field(enum afs_volume_trace, reason ) ), TP_fast_assign( __entry->vid = vid; __entry->ref = ref; __entry->reason = reason; ), TP_printk("V=%llx %s ur=%d", __entry->vid, __print_symbolic(__entry->reason, afs_volume_traces), __entry->ref) ); TRACE_EVENT(afs_cell, TP_PROTO(unsigned int cell_debug_id, int ref, int active, enum afs_cell_trace reason), TP_ARGS(cell_debug_id, ref, active, reason), TP_STRUCT__entry( __field(unsigned int, cell ) __field(int, ref ) __field(int, active ) __field(int, reason ) ), TP_fast_assign( __entry->cell = cell_debug_id; __entry->ref = ref; __entry->active = active; __entry->reason = reason; ), TP_printk("L=%08x %s r=%d a=%d", __entry->cell, __print_symbolic(__entry->reason, afs_cell_traces), __entry->ref, __entry->active) ); #endif /* _TRACE_AFS_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
50 50 50 50 50 50 50 50 50 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 | // SPDX-License-Identifier: GPL-2.0-only /* * net/sunrpc/cache.c * * Generic code for various authentication-related caches * used by sunrpc clients and servers. * * Copyright (C) 2002 Neil Brown <neilb@cse.unsw.edu.au> */ #include <linux/types.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/string_helpers.h> #include <linux/uaccess.h> #include <linux/poll.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/pagemap.h> #include <asm/ioctls.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <trace/events/sunrpc.h> #include "netns.h" #define RPCDBG_FACILITY RPCDBG_CACHE static bool cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); static void cache_init(struct cache_head *h, struct cache_detail *detail) { time64_t now = seconds_since_boot(); INIT_HLIST_NODE(&h->cache_list); h->flags = 0; kref_init(&h->ref); h->expiry_time = now + CACHE_NEW_EXPIRY; if (now <= detail->flush_time) /* ensure it isn't already expired */ now = detail->flush_time + 1; h->last_refresh = now; } static void cache_fresh_unlocked(struct cache_head *head, struct cache_detail *detail); static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, struct cache_head *key, int hash) { struct hlist_head *head = &detail->hash_table[hash]; struct cache_head *tmp; rcu_read_lock(); hlist_for_each_entry_rcu(tmp, head, cache_list) { if (!detail->match(tmp, key)) continue; if (test_bit(CACHE_VALID, &tmp->flags) && cache_is_expired(detail, tmp)) continue; tmp = cache_get_rcu(tmp); rcu_read_unlock(); return tmp; } rcu_read_unlock(); return NULL; } static void sunrpc_begin_cache_remove_entry(struct cache_head *ch, struct cache_detail *cd) { /* Must be called under cd->hash_lock */ hlist_del_init_rcu(&ch->cache_list); set_bit(CACHE_CLEANED, &ch->flags); cd->entries --; } static void sunrpc_end_cache_remove_entry(struct cache_head *ch, struct cache_detail *cd) { cache_fresh_unlocked(ch, cd); cache_put(ch, cd); } static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, struct cache_head *key, int hash) { struct cache_head *new, *tmp, *freeme = NULL; struct hlist_head *head = &detail->hash_table[hash]; new = detail->alloc(); if (!new) return NULL; /* must fully initialise 'new', else * we might get lose if we need to * cache_put it soon. */ cache_init(new, detail); detail->init(new, key); spin_lock(&detail->hash_lock); /* check if entry appeared while we slept */ hlist_for_each_entry_rcu(tmp, head, cache_list, lockdep_is_held(&detail->hash_lock)) { if (!detail->match(tmp, key)) continue; if (test_bit(CACHE_VALID, &tmp->flags) && cache_is_expired(detail, tmp)) { sunrpc_begin_cache_remove_entry(tmp, detail); trace_cache_entry_expired(detail, tmp); freeme = tmp; break; } cache_get(tmp); spin_unlock(&detail->hash_lock); cache_put(new, detail); return tmp; } hlist_add_head_rcu(&new->cache_list, head); detail->entries++; cache_get(new); spin_unlock(&detail->hash_lock); if (freeme) sunrpc_end_cache_remove_entry(freeme, detail); return new; } struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail, struct cache_head *key, int hash) { struct cache_head *ret; ret = sunrpc_cache_find_rcu(detail, key, hash); if (ret) return ret; /* Didn't find anything, insert an empty entry */ return sunrpc_cache_add_entry(detail, key, hash); } EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu); static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); static void cache_fresh_locked(struct cache_head *head, time64_t expiry, struct cache_detail *detail) { time64_t now = seconds_since_boot(); if (now <= detail->flush_time) /* ensure it isn't immediately treated as expired */ now = detail->flush_time + 1; head->expiry_time = expiry; head->last_refresh = now; smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */ set_bit(CACHE_VALID, &head->flags); } static void cache_fresh_unlocked(struct cache_head *head, struct cache_detail *detail) { if (test_and_clear_bit(CACHE_PENDING, &head->flags)) { cache_revisit_request(head); cache_dequeue(detail, head); } } static void cache_make_negative(struct cache_detail *detail, struct cache_head *h) { set_bit(CACHE_NEGATIVE, &h->flags); trace_cache_entry_make_negative(detail, h); } static void cache_entry_update(struct cache_detail *detail, struct cache_head *h, struct cache_head *new) { if (!test_bit(CACHE_NEGATIVE, &new->flags)) { detail->update(h, new); trace_cache_entry_update(detail, h); } else { cache_make_negative(detail, h); } } struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *new, struct cache_head *old, int hash) { /* The 'old' entry is to be replaced by 'new'. * If 'old' is not VALID, we update it directly, * otherwise we need to replace it */ struct cache_head *tmp; if (!test_bit(CACHE_VALID, &old->flags)) { spin_lock(&detail->hash_lock); if (!test_bit(CACHE_VALID, &old->flags)) { cache_entry_update(detail, old, new); cache_fresh_locked(old, new->expiry_time, detail); spin_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); return old; } spin_unlock(&detail->hash_lock); } /* We need to insert a new entry */ tmp = detail->alloc(); if (!tmp) { cache_put(old, detail); return NULL; } cache_init(tmp, detail); detail->init(tmp, old); spin_lock(&detail->hash_lock); cache_entry_update(detail, tmp, new); hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); cache_fresh_locked(tmp, new->expiry_time, detail); cache_fresh_locked(old, 0, detail); spin_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail); cache_fresh_unlocked(old, detail); cache_put(old, detail); return tmp; } EXPORT_SYMBOL_GPL(sunrpc_cache_update); static inline int cache_is_valid(struct cache_head *h) { if (!test_bit(CACHE_VALID, &h->flags)) return -EAGAIN; else { /* entry is valid */ if (test_bit(CACHE_NEGATIVE, &h->flags)) return -ENOENT; else { /* * In combination with write barrier in * sunrpc_cache_update, ensures that anyone * using the cache entry after this sees the * updated contents: */ smp_rmb(); return 0; } } } static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h) { int rv; spin_lock(&detail->hash_lock); rv = cache_is_valid(h); if (rv == -EAGAIN) { cache_make_negative(detail, h); cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY, detail); rv = -ENOENT; } spin_unlock(&detail->hash_lock); cache_fresh_unlocked(h, detail); return rv; } /* * This is the generic cache management routine for all * the authentication caches. * It checks the currency of a cache item and will (later) * initiate an upcall to fill it if needed. * * * Returns 0 if the cache_head can be used, or cache_puts it and returns * -EAGAIN if upcall is pending and request has been queued * -ETIMEDOUT if upcall failed or request could not be queue or * upcall completed but item is still invalid (implying that * the cache item has been replaced with a newer one). * -ENOENT if cache entry was negative */ int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp) { int rv; time64_t refresh_age, age; /* First decide return status as best we can */ rv = cache_is_valid(h); /* now see if we want to start an upcall */ refresh_age = (h->expiry_time - h->last_refresh); age = seconds_since_boot() - h->last_refresh; if (rqstp == NULL) { if (rv == -EAGAIN) rv = -ENOENT; } else if (rv == -EAGAIN || (h->expiry_time != 0 && age > refresh_age/2)) { dprintk("RPC: Want update, refage=%lld, age=%lld\n", refresh_age, age); switch (detail->cache_upcall(detail, h)) { case -EINVAL: rv = try_to_negate_entry(detail, h); break; case -EAGAIN: cache_fresh_unlocked(h, detail); break; } } if (rv == -EAGAIN) { if (!cache_defer_req(rqstp, h)) { /* * Request was not deferred; handle it as best * we can ourselves: */ rv = cache_is_valid(h); if (rv == -EAGAIN) rv = -ETIMEDOUT; } } if (rv) cache_put(h, detail); return rv; } EXPORT_SYMBOL_GPL(cache_check); /* * caches need to be periodically cleaned. * For this we maintain a list of cache_detail and * a current pointer into that list and into the table * for that entry. * * Each time cache_clean is called it finds the next non-empty entry * in the current table and walks the list in that entry * looking for entries that can be removed. * * An entry gets removed if: * - The expiry is before current time * - The last_refresh time is before the flush_time for that cache * * later we might drop old entries with non-NEVER expiry if that table * is getting 'full' for some definition of 'full' * * The question of "how often to scan a table" is an interesting one * and is answered in part by the use of the "nextcheck" field in the * cache_detail. * When a scan of a table begins, the nextcheck field is set to a time * that is well into the future. * While scanning, if an expiry time is found that is earlier than the * current nextcheck time, nextcheck is set to that expiry time. * If the flush_time is ever set to a time earlier than the nextcheck * time, the nextcheck time is then set to that flush_time. * * A table is then only scanned if the current time is at least * the nextcheck time. * */ static LIST_HEAD(cache_list); static DEFINE_SPINLOCK(cache_list_lock); static struct cache_detail *current_detail; static int current_index; static void do_cache_clean(struct work_struct *work); static struct delayed_work cache_cleaner; void sunrpc_init_cache_detail(struct cache_detail *cd) { spin_lock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; atomic_set(&cd->writers, 0); cd->last_close = 0; cd->last_warn = -1; list_add(&cd->others, &cache_list); spin_unlock(&cache_list_lock); /* start the cleaning process */ queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0); } EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail); void sunrpc_destroy_cache_detail(struct cache_detail *cd) { cache_purge(cd); spin_lock(&cache_list_lock); spin_lock(&cd->hash_lock); if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); spin_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); if (list_empty(&cache_list)) { /* module must be being unloaded so its safe to kill the worker */ cancel_delayed_work_sync(&cache_cleaner); } } EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); /* clean cache tries to find something to clean * and cleans it. * It returns 1 if it cleaned something, * 0 if it didn't find anything this time * -1 if it fell off the end of the list. */ static int cache_clean(void) { int rv = 0; struct list_head *next; spin_lock(&cache_list_lock); /* find a suitable table if we don't already have one */ while (current_detail == NULL || current_index >= current_detail->hash_size) { if (current_detail) next = current_detail->others.next; else next = cache_list.next; if (next == &cache_list) { current_detail = NULL; spin_unlock(&cache_list_lock); return -1; } current_detail = list_entry(next, struct cache_detail, others); if (current_detail->nextcheck > seconds_since_boot()) current_index = current_detail->hash_size; else { current_index = 0; current_detail->nextcheck = seconds_since_boot()+30*60; } } /* find a non-empty bucket in the table */ while (current_detail && current_index < current_detail->hash_size && hlist_empty(¤t_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ if (current_detail && current_index < current_detail->hash_size) { struct cache_head *ch = NULL; struct cache_detail *d; struct hlist_head *head; struct hlist_node *tmp; spin_lock(¤t_detail->hash_lock); /* Ok, now to clean this strand */ head = ¤t_detail->hash_table[current_index]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) current_detail->nextcheck = ch->expiry_time+1; if (!cache_is_expired(current_detail, ch)) continue; sunrpc_begin_cache_remove_entry(ch, current_detail); trace_cache_entry_expired(current_detail, ch); rv = 1; break; } spin_unlock(¤t_detail->hash_lock); d = current_detail; if (!ch) current_index ++; spin_unlock(&cache_list_lock); if (ch) sunrpc_end_cache_remove_entry(ch, d); } else spin_unlock(&cache_list_lock); return rv; } /* * We want to regularly clean the cache, so we need to schedule some work ... */ static void do_cache_clean(struct work_struct *work) { int delay; if (list_empty(&cache_list)) return; if (cache_clean() == -1) delay = round_jiffies_relative(30*HZ); else delay = 5; queue_delayed_work(system_power_efficient_wq, &cache_cleaner, delay); } /* * Clean all caches promptly. This just calls cache_clean * repeatedly until we are sure that every cache has had a chance to * be fully cleaned */ void cache_flush(void) { while (cache_clean() != -1) cond_resched(); while (cache_clean() != -1) cond_resched(); } EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { struct cache_head *ch = NULL; struct hlist_head *head = NULL; int i = 0; spin_lock(&detail->hash_lock); if (!detail->entries) { spin_unlock(&detail->hash_lock); return; } dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name); for (i = 0; i < detail->hash_size; i++) { head = &detail->hash_table[i]; while (!hlist_empty(head)) { ch = hlist_entry(head->first, struct cache_head, cache_list); sunrpc_begin_cache_remove_entry(ch, detail); spin_unlock(&detail->hash_lock); sunrpc_end_cache_remove_entry(ch, detail); spin_lock(&detail->hash_lock); } } spin_unlock(&detail->hash_lock); } EXPORT_SYMBOL_GPL(cache_purge); /* * Deferral and Revisiting of Requests. * * If a cache lookup finds a pending entry, we * need to defer the request and revisit it later. * All deferred requests are stored in a hash table, * indexed by "struct cache_head *". * As it may be wasteful to store a whole request * structure, we allow the request to provide a * deferred form, which must contain a * 'struct cache_deferred_req' * This cache_deferred_req contains a method to allow * it to be revisited when cache info is available */ #define DFR_HASHSIZE (PAGE_SIZE/sizeof(struct list_head)) #define DFR_HASH(item) ((((long)item)>>4 ^ (((long)item)>>13)) % DFR_HASHSIZE) #define DFR_MAX 300 /* ??? */ static DEFINE_SPINLOCK(cache_defer_lock); static LIST_HEAD(cache_defer_list); static struct hlist_head cache_defer_hash[DFR_HASHSIZE]; static int cache_defer_cnt; static void __unhash_deferred_req(struct cache_deferred_req *dreq) { hlist_del_init(&dreq->hash); if (!list_empty(&dreq->recent)) { list_del_init(&dreq->recent); cache_defer_cnt--; } } static void __hash_deferred_req(struct cache_deferred_req *dreq, struct cache_head *item) { int hash = DFR_HASH(item); INIT_LIST_HEAD(&dreq->recent); hlist_add_head(&dreq->hash, &cache_defer_hash[hash]); } static void setup_deferral(struct cache_deferred_req *dreq, struct cache_head *item, int count_me) { dreq->item = item; spin_lock(&cache_defer_lock); __hash_deferred_req(dreq, item); if (count_me) { cache_defer_cnt++; list_add(&dreq->recent, &cache_defer_list); } spin_unlock(&cache_defer_lock); } struct thread_deferred_req { struct cache_deferred_req handle; struct completion completion; }; static void cache_restart_thread(struct cache_deferred_req *dreq, int too_many) { struct thread_deferred_req *dr = container_of(dreq, struct thread_deferred_req, handle); complete(&dr->completion); } static void cache_wait_req(struct cache_req *req, struct cache_head *item) { struct thread_deferred_req sleeper; struct cache_deferred_req *dreq = &sleeper.handle; sleeper.completion = COMPLETION_INITIALIZER_ONSTACK(sleeper.completion); dreq->revisit = cache_restart_thread; setup_deferral(dreq, item, 0); if (!test_bit(CACHE_PENDING, &item->flags) || wait_for_completion_interruptible_timeout( &sleeper.completion, req->thread_wait) <= 0) { /* The completion wasn't completed, so we need * to clean up */ spin_lock(&cache_defer_lock); if (!hlist_unhashed(&sleeper.handle.hash)) { __unhash_deferred_req(&sleeper.handle); spin_unlock(&cache_defer_lock); } else { /* cache_revisit_request already removed * this from the hash table, but hasn't * called ->revisit yet. It will very soon * and we need to wait for it. */ spin_unlock(&cache_defer_lock); wait_for_completion(&sleeper.completion); } } } static void cache_limit_defers(void) { /* Make sure we haven't exceed the limit of allowed deferred * requests. */ struct cache_deferred_req *discard = NULL; if (cache_defer_cnt <= DFR_MAX) return; spin_lock(&cache_defer_lock); /* Consider removing either the first or the last */ if (cache_defer_cnt > DFR_MAX) { if (prandom_u32() & 1) discard = list_entry(cache_defer_list.next, struct cache_deferred_req, recent); else discard = list_entry(cache_defer_list.prev, struct cache_deferred_req, recent); __unhash_deferred_req(discard); } spin_unlock(&cache_defer_lock); if (discard) discard->revisit(discard, 1); } /* Return true if and only if a deferred request is queued. */ static bool cache_defer_req(struct cache_req *req, struct cache_head *item) { struct cache_deferred_req *dreq; if (req->thread_wait) { cache_wait_req(req, item); if (!test_bit(CACHE_PENDING, &item->flags)) return false; } dreq = req->defer(req); if (dreq == NULL) return false; setup_deferral(dreq, item, 1); if (!test_bit(CACHE_PENDING, &item->flags)) /* Bit could have been cleared before we managed to * set up the deferral, so need to revisit just in case */ cache_revisit_request(item); cache_limit_defers(); return true; } static void cache_revisit_request(struct cache_head *item) { struct cache_deferred_req *dreq; struct list_head pending; struct hlist_node *tmp; int hash = DFR_HASH(item); INIT_LIST_HEAD(&pending); spin_lock(&cache_defer_lock); hlist_for_each_entry_safe(dreq, tmp, &cache_defer_hash[hash], hash) if (dreq->item == item) { __unhash_deferred_req(dreq); list_add(&dreq->recent, &pending); } spin_unlock(&cache_defer_lock); while (!list_empty(&pending)) { dreq = list_entry(pending.next, struct cache_deferred_req, recent); list_del_init(&dreq->recent); dreq->revisit(dreq, 0); } } void cache_clean_deferred(void *owner) { struct cache_deferred_req *dreq, *tmp; struct list_head pending; INIT_LIST_HEAD(&pending); spin_lock(&cache_defer_lock); list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) { if (dreq->owner == owner) { __unhash_deferred_req(dreq); list_add(&dreq->recent, &pending); } } spin_unlock(&cache_defer_lock); while (!list_empty(&pending)) { dreq = list_entry(pending.next, struct cache_deferred_req, recent); list_del_init(&dreq->recent); dreq->revisit(dreq, 1); } } /* * communicate with user-space * * We have a magic /proc file - /proc/net/rpc/<cachename>/channel. * On read, you get a full request, or block. * On write, an update request is processed. * Poll works if anything to read, and always allows write. * * Implemented by linked list of requests. Each open file has * a ->private that also exists in this list. New requests are added * to the end and may wakeup and preceding readers. * New readers are added to the head. If, on read, an item is found with * CACHE_UPCALLING clear, we free it from the list. * */ static DEFINE_SPINLOCK(queue_lock); struct cache_queue { struct list_head list; int reader; /* if 0, then request */ }; struct cache_request { struct cache_queue q; struct cache_head *item; char * buf; int len; int readers; }; struct cache_reader { struct cache_queue q; int offset; /* if non-0, we have a refcnt on next request */ }; static int cache_request(struct cache_detail *detail, struct cache_request *crq) { char *bp = crq->buf; int len = PAGE_SIZE; detail->cache_request(detail, crq->item, &bp, &len); if (len < 0) return -E2BIG; return PAGE_SIZE - len; } static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { struct cache_reader *rp = filp->private_data; struct cache_request *rq; struct inode *inode = file_inode(filp); int err; if (count == 0) return 0; inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: spin_lock(&queue_lock); /* need to find next request */ while (rp->q.list.next != &cd->queue && list_entry(rp->q.list.next, struct cache_queue, list) ->reader) { struct list_head *next = rp->q.list.next; list_move(&rp->q.list, next); } if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } rq = container_of(rp->q.list.next, struct cache_request, q.list); WARN_ON_ONCE(rq->q.reader); if (rp->offset == 0) rq->readers++; spin_unlock(&queue_lock); if (rq->len == 0) { err = cache_request(cd, rq); if (err < 0) goto out; rq->len = err; } if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { err = -EAGAIN; spin_lock(&queue_lock); list_move(&rp->q.list, &rq->q.list); spin_unlock(&queue_lock); } else { if (rp->offset + count > rq->len) count = rq->len - rp->offset; err = -EFAULT; if (copy_to_user(buf, rq->buf + rp->offset, count)) goto out; rp->offset += count; if (rp->offset >= rq->len) { rp->offset = 0; spin_lock(&queue_lock); list_move(&rp->q.list, &rq->q.list); spin_unlock(&queue_lock); } err = 0; } out: if (rp->offset == 0) { /* need to release rq */ spin_lock(&queue_lock); rq->readers--; if (rq->readers == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { list_del(&rq->q.list); spin_unlock(&queue_lock); cache_put(rq->item, cd); kfree(rq->buf); kfree(rq); } else spin_unlock(&queue_lock); } if (err == -EAGAIN) goto again; inode_unlock(inode); return err ? err : count; } static ssize_t cache_do_downcall(char *kaddr, const char __user *buf, size_t count, struct cache_detail *cd) { ssize_t ret; if (count == 0) return -EINVAL; if (copy_from_user(kaddr, buf, count)) return -EFAULT; kaddr[count] = '\0'; ret = cd->cache_parse(cd, kaddr, count); if (!ret) ret = count; return ret; } static ssize_t cache_downcall(struct address_space *mapping, const char __user *buf, size_t count, struct cache_detail *cd) { char *write_buf; ssize_t ret = -ENOMEM; if (count >= 32768) { /* 32k is max userland buffer, lets check anyway */ ret = -EINVAL; goto out; } write_buf = kvmalloc(count + 1, GFP_KERNEL); if (!write_buf) goto out; ret = cache_do_downcall(write_buf, buf, count, cd); kvfree(write_buf); out: return ret; } static ssize_t cache_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { struct address_space *mapping = filp->f_mapping; struct inode *inode = file_inode(filp); ssize_t ret = -EINVAL; if (!cd->cache_parse) goto out; inode_lock(inode); ret = cache_downcall(mapping, buf, count, cd); inode_unlock(inode); out: return ret; } static DECLARE_WAIT_QUEUE_HEAD(queue_wait); static __poll_t cache_poll(struct file *filp, poll_table *wait, struct cache_detail *cd) { __poll_t mask; struct cache_reader *rp = filp->private_data; struct cache_queue *cq; poll_wait(filp, &queue_wait, wait); /* alway allow write */ mask = EPOLLOUT | EPOLLWRNORM; if (!rp) return mask; spin_lock(&queue_lock); for (cq= &rp->q; &cq->list != &cd->queue; cq = list_entry(cq->list.next, struct cache_queue, list)) if (!cq->reader) { mask |= EPOLLIN | EPOLLRDNORM; break; } spin_unlock(&queue_lock); return mask; } static int cache_ioctl(struct inode *ino, struct file *filp, unsigned int cmd, unsigned long arg, struct cache_detail *cd) { int len = 0; struct cache_reader *rp = filp->private_data; struct cache_queue *cq; if (cmd != FIONREAD || !rp) return -EINVAL; spin_lock(&queue_lock); /* only find the length remaining in current request, * or the length of the next request */ for (cq= &rp->q; &cq->list != &cd->queue; cq = list_entry(cq->list.next, struct cache_queue, list)) if (!cq->reader) { struct cache_request *cr = container_of(cq, struct cache_request, q); len = cr->len - rp->offset; break; } spin_unlock(&queue_lock); return put_user(len, (int __user *)arg); } static int cache_open(struct inode *inode, struct file *filp, struct cache_detail *cd) { struct cache_reader *rp = NULL; if (!cd || !try_module_get(cd->owner)) return -EACCES; nonseekable_open(inode, filp); if (filp->f_mode & FMODE_READ) { rp = kmalloc(sizeof(*rp), GFP_KERNEL); if (!rp) { module_put(cd->owner); return -ENOMEM; } rp->offset = 0; rp->q.reader = 1; spin_lock(&queue_lock); list_add(&rp->q.list, &cd->queue); spin_unlock(&queue_lock); } if (filp->f_mode & FMODE_WRITE) atomic_inc(&cd->writers); filp->private_data = rp; return 0; } static int cache_release(struct inode *inode, struct file *filp, struct cache_detail *cd) { struct cache_reader *rp = filp->private_data; if (rp) { spin_lock(&queue_lock); if (rp->offset) { struct cache_queue *cq; for (cq= &rp->q; &cq->list != &cd->queue; cq = list_entry(cq->list.next, struct cache_queue, list)) if (!cq->reader) { container_of(cq, struct cache_request, q) ->readers--; break; } rp->offset = 0; } list_del(&rp->q.list); spin_unlock(&queue_lock); filp->private_data = NULL; kfree(rp); } if (filp->f_mode & FMODE_WRITE) { atomic_dec(&cd->writers); cd->last_close = seconds_since_boot(); } module_put(cd->owner); return 0; } static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) { struct cache_queue *cq, *tmp; struct cache_request *cr; struct list_head dequeued; INIT_LIST_HEAD(&dequeued); spin_lock(&queue_lock); list_for_each_entry_safe(cq, tmp, &detail->queue, list) if (!cq->reader) { cr = container_of(cq, struct cache_request, q); if (cr->item != ch) continue; if (test_bit(CACHE_PENDING, &ch->flags)) /* Lost a race and it is pending again */ break; if (cr->readers != 0) continue; list_move(&cr->q.list, &dequeued); } spin_unlock(&queue_lock); while (!list_empty(&dequeued)) { cr = list_entry(dequeued.next, struct cache_request, q.list); list_del(&cr->q.list); cache_put(cr->item, detail); kfree(cr->buf); kfree(cr); } } /* * Support routines for text-based upcalls. * Fields are separated by spaces. * Fields are either mangled to quote space tab newline slosh with slosh * or a hexified with a leading \x * Record is terminated with newline. * */ void qword_add(char **bpp, int *lp, char *str) { char *bp = *bpp; int len = *lp; int ret; if (len < 0) return; ret = string_escape_str(str, bp, len, ESCAPE_OCTAL, "\\ \n\t"); if (ret >= len) { bp += len; len = -1; } else { bp += ret; len -= ret; *bp++ = ' '; len--; } *bpp = bp; *lp = len; } EXPORT_SYMBOL_GPL(qword_add); void qword_addhex(char **bpp, int *lp, char *buf, int blen) { char *bp = *bpp; int len = *lp; if (len < 0) return; if (len > 2) { *bp++ = '\\'; *bp++ = 'x'; len -= 2; while (blen && len >= 2) { bp = hex_byte_pack(bp, *buf++); len -= 2; blen--; } } if (blen || len<1) len = -1; else { *bp++ = ' '; len--; } *bpp = bp; *lp = len; } EXPORT_SYMBOL_GPL(qword_addhex); static void warn_no_listener(struct cache_detail *detail) { if (detail->last_warn != detail->last_close) { detail->last_warn = detail->last_close; if (detail->warn_no_listener) detail->warn_no_listener(detail, detail->last_close != 0); } } static bool cache_listeners_exist(struct cache_detail *detail) { if (atomic_read(&detail->writers)) return true; if (detail->last_close == 0) /* This cache was never opened */ return false; if (detail->last_close < seconds_since_boot() - 30) /* * We allow for the possibility that someone might * restart a userspace daemon without restarting the * server; but after 30 seconds, we give up. */ return false; return true; } /* * register an upcall request to user-space and queue it up for read() by the * upcall daemon. * * Each request is at most one page long. */ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) { char *buf; struct cache_request *crq; int ret = 0; if (test_bit(CACHE_CLEANED, &h->flags)) /* Too late to make an upcall */ return -EAGAIN; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) return -EAGAIN; crq = kmalloc(sizeof (*crq), GFP_KERNEL); if (!crq) { kfree(buf); return -EAGAIN; } crq->q.reader = 0; crq->buf = buf; crq->len = 0; crq->readers = 0; spin_lock(&queue_lock); if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); list_add_tail(&crq->q.list, &detail->queue); trace_cache_entry_upcall(detail, h); } else /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; spin_unlock(&queue_lock); wake_up(&queue_wait); if (ret == -EAGAIN) { kfree(buf); kfree(crq); } return ret; } int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) { if (test_and_set_bit(CACHE_PENDING, &h->flags)) return 0; return cache_pipe_upcall(detail, h); } EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall); int sunrpc_cache_pipe_upcall_timeout(struct cache_detail *detail, struct cache_head *h) { if (!cache_listeners_exist(detail)) { warn_no_listener(detail); trace_cache_entry_no_listener(detail, h); return -EINVAL; } return sunrpc_cache_pipe_upcall(detail, h); } EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall_timeout); /* * parse a message from user-space and pass it * to an appropriate cache * Messages are, like requests, separated into fields by * spaces and dequotes as \xHEXSTRING or embedded \nnn octal * * Message is * reply cachename expiry key ... content.... * * key and content are both parsed by cache */ int qword_get(char **bpp, char *dest, int bufsize) { /* return bytes copied, or -1 on error */ char *bp = *bpp; int len = 0; while (*bp == ' ') bp++; if (bp[0] == '\\' && bp[1] == 'x') { /* HEX STRING */ bp += 2; while (len < bufsize - 1) { int h, l; h = hex_to_bin(bp[0]); if (h < 0) break; l = hex_to_bin(bp[1]); if (l < 0) break; *dest++ = (h << 4) | l; bp += 2; len++; } } else { /* text with \nnn octal quoting */ while (*bp != ' ' && *bp != '\n' && *bp && len < bufsize-1) { if (*bp == '\\' && isodigit(bp[1]) && (bp[1] <= '3') && isodigit(bp[2]) && isodigit(bp[3])) { int byte = (*++bp -'0'); bp++; byte = (byte << 3) | (*bp++ - '0'); byte = (byte << 3) | (*bp++ - '0'); *dest++ = byte; len++; } else { *dest++ = *bp++; len++; } } } if (*bp != ' ' && *bp != '\n' && *bp != '\0') return -1; while (*bp == ' ') bp++; *bpp = bp; *dest = '\0'; return len; } EXPORT_SYMBOL_GPL(qword_get); /* * support /proc/net/rpc/$CACHENAME/content * as a seqfile. * We call ->cache_show passing NULL for the item to * get a header, then pass each real item in the cache */ static void *__cache_seq_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; unsigned int hash, entry; struct cache_head *ch; struct cache_detail *cd = m->private; if (!n--) return SEQ_START_TOKEN; hash = n >> 32; entry = n & ((1LL<<32) - 1); hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; n &= ~((1LL<<32) - 1); do { hash++; n += 1LL<<32; } while(hash < cd->hash_size && hlist_empty(&cd->hash_table[hash])); if (hash >= cd->hash_size) return NULL; *pos = n+1; return hlist_entry_safe(rcu_dereference_raw( hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); } static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) { struct cache_head *ch = p; int hash = (*pos >> 32); struct cache_detail *cd = m->private; if (p == SEQ_START_TOKEN) hash = 0; else if (ch->cache_list.next == NULL) { hash++; *pos += 1LL<<32; } else { ++*pos; return hlist_entry_safe(rcu_dereference_raw( hlist_next_rcu(&ch->cache_list)), struct cache_head, cache_list); } *pos &= ~((1LL<<32) - 1); while (hash < cd->hash_size && hlist_empty(&cd->hash_table[hash])) { hash++; *pos += 1LL<<32; } if (hash >= cd->hash_size) return NULL; ++*pos; return hlist_entry_safe(rcu_dereference_raw( hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); } void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return __cache_seq_start(m, pos); } EXPORT_SYMBOL_GPL(cache_seq_start_rcu); void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos) { return cache_seq_next(file, p, pos); } EXPORT_SYMBOL_GPL(cache_seq_next_rcu); void cache_seq_stop_rcu(struct seq_file *m, void *p) __releases(RCU) { rcu_read_unlock(); } EXPORT_SYMBOL_GPL(cache_seq_stop_rcu); static int c_show(struct seq_file *m, void *p) { struct cache_head *cp = p; struct cache_detail *cd = m->private; if (p == SEQ_START_TOKEN) return cd->cache_show(m, cd, NULL); ifdebug(CACHE) seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), kref_read(&cp->ref), cp->flags); if (!cache_get_rcu(cp)) return 0; if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ seq_puts(m, "# "); else { if (cache_is_expired(cd, cp)) seq_puts(m, "# "); cache_put(cp, cd); } return cd->cache_show(m, cd, cp); } static const struct seq_operations cache_content_op = { .start = cache_seq_start_rcu, .next = cache_seq_next_rcu, .stop = cache_seq_stop_rcu, .show = c_show, }; static int content_open(struct inode *inode, struct file *file, struct cache_detail *cd) { struct seq_file *seq; int err; if (!cd || !try_module_get(cd->owner)) return -EACCES; err = seq_open(file, &cache_content_op); if (err) { module_put(cd->owner); return err; } seq = file->private_data; seq->private = cd; return 0; } static int content_release(struct inode *inode, struct file *file, struct cache_detail *cd) { int ret = seq_release(inode, file); module_put(cd->owner); return ret; } static int open_flush(struct inode *inode, struct file *file, struct cache_detail *cd) { if (!cd || !try_module_get(cd->owner)) return -EACCES; return nonseekable_open(inode, file); } static int release_flush(struct inode *inode, struct file *file, struct cache_detail *cd) { module_put(cd->owner); return 0; } static ssize_t read_flush(struct file *file, char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { char tbuf[22]; size_t len; len = snprintf(tbuf, sizeof(tbuf), "%llu\n", convert_to_wallclock(cd->flush_time)); return simple_read_from_buffer(buf, count, ppos, tbuf, len); } static ssize_t write_flush(struct file *file, const char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { char tbuf[20]; char *ep; time64_t now; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; if (copy_from_user(tbuf, buf, count)) return -EFAULT; tbuf[count] = 0; simple_strtoul(tbuf, &ep, 0); if (*ep && *ep != '\n') return -EINVAL; /* Note that while we check that 'buf' holds a valid number, * we always ignore the value and just flush everything. * Making use of the number leads to races. */ now = seconds_since_boot(); /* Always flush everything, so behave like cache_purge() * Do this by advancing flush_time to the current time, * or by one second if it has already reached the current time. * Newly added cache entries will always have ->last_refresh greater * that ->flush_time, so they don't get flushed prematurely. */ if (cd->flush_time >= now) now = cd->flush_time + 1; cd->flush_time = now; cd->nextcheck = now; cache_flush(); if (cd->flush) cd->flush(); *ppos += count; return count; } static ssize_t cache_read_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct cache_detail *cd = PDE_DATA(file_inode(filp)); return cache_read(filp, buf, count, ppos, cd); } static ssize_t cache_write_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct cache_detail *cd = PDE_DATA(file_inode(filp)); return cache_write(filp, buf, count, ppos, cd); } static __poll_t cache_poll_procfs(struct file *filp, poll_table *wait) { struct cache_detail *cd = PDE_DATA(file_inode(filp)); return cache_poll(filp, wait, cd); } static long cache_ioctl_procfs(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct cache_detail *cd = PDE_DATA(inode); return cache_ioctl(inode, filp, cmd, arg, cd); } static int cache_open_procfs(struct inode *inode, struct file *filp) { struct cache_detail *cd = PDE_DATA(inode); return cache_open(inode, filp, cd); } static int cache_release_procfs(struct inode *inode, struct file *filp) { struct cache_detail *cd = PDE_DATA(inode); return cache_release(inode, filp, cd); } static const struct proc_ops cache_channel_proc_ops = { .proc_lseek = no_llseek, .proc_read = cache_read_procfs, .proc_write = cache_write_procfs, .proc_poll = cache_poll_procfs, .proc_ioctl = cache_ioctl_procfs, /* for FIONREAD */ .proc_open = cache_open_procfs, .proc_release = cache_release_procfs, }; static int content_open_procfs(struct inode *inode, struct file *filp) { struct cache_detail *cd = PDE_DATA(inode); return content_open(inode, filp, cd); } static int content_release_procfs(struct inode *inode, struct file *filp) { struct cache_detail *cd = PDE_DATA(inode); return content_release(inode, filp, cd); } static const struct proc_ops content_proc_ops = { .proc_open = content_open_procfs, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = content_release_procfs, }; static int open_flush_procfs(struct inode *inode, struct file *filp) { struct cache_detail *cd = PDE_DATA(inode); return open_flush(inode, filp, cd); } static int release_flush_procfs(struct inode *inode, struct file *filp) { struct cache_detail *cd = PDE_DATA(inode); return release_flush(inode, filp, cd); } static ssize_t read_flush_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct cache_detail *cd = PDE_DATA(file_inode(filp)); return read_flush(filp, buf, count, ppos, cd); } static ssize_t write_flush_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct cache_detail *cd = PDE_DATA(file_inode(filp)); return write_flush(filp, buf, count, ppos, cd); } static const struct proc_ops cache_flush_proc_ops = { .proc_open = open_flush_procfs, .proc_read = read_flush_procfs, .proc_write = write_flush_procfs, .proc_release = release_flush_procfs, .proc_lseek = no_llseek, }; static void remove_cache_proc_entries(struct cache_detail *cd) { if (cd->procfs) { proc_remove(cd->procfs); cd->procfs = NULL; } } static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) { struct proc_dir_entry *p; struct sunrpc_net *sn; if (!IS_ENABLED(CONFIG_PROC_FS)) return 0; sn = net_generic(net, sunrpc_net_id); cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); if (cd->procfs == NULL) goto out_nomem; p = proc_create_data("flush", S_IFREG | 0600, cd->procfs, &cache_flush_proc_ops, cd); if (p == NULL) goto out_nomem; if (cd->cache_request || cd->cache_parse) { p = proc_create_data("channel", S_IFREG | 0600, cd->procfs, &cache_channel_proc_ops, cd); if (p == NULL) goto out_nomem; } if (cd->cache_show) { p = proc_create_data("content", S_IFREG | 0400, cd->procfs, &content_proc_ops, cd); if (p == NULL) goto out_nomem; } return 0; out_nomem: remove_cache_proc_entries(cd); return -ENOMEM; } void __init cache_initialize(void) { INIT_DEFERRABLE_WORK(&cache_cleaner, do_cache_clean); } int cache_register_net(struct cache_detail *cd, struct net *net) { int ret; sunrpc_init_cache_detail(cd); ret = create_cache_proc_entries(cd, net); if (ret) sunrpc_destroy_cache_detail(cd); return ret; } EXPORT_SYMBOL_GPL(cache_register_net); void cache_unregister_net(struct cache_detail *cd, struct net *net) { remove_cache_proc_entries(cd); sunrpc_destroy_cache_detail(cd); } EXPORT_SYMBOL_GPL(cache_unregister_net); struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct net *net) { struct cache_detail *cd; int i; cd = kmemdup(tmpl, sizeof(struct cache_detail), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); cd->hash_table = kcalloc(cd->hash_size, sizeof(struct hlist_head), GFP_KERNEL); if (cd->hash_table == NULL) { kfree(cd); return ERR_PTR(-ENOMEM); } for (i = 0; i < cd->hash_size; i++) INIT_HLIST_HEAD(&cd->hash_table[i]); cd->net = net; return cd; } EXPORT_SYMBOL_GPL(cache_create_net); void cache_destroy_net(struct cache_detail *cd, struct net *net) { kfree(cd->hash_table); kfree(cd); } EXPORT_SYMBOL_GPL(cache_destroy_net); static ssize_t cache_read_pipefs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct cache_detail *cd = RPC_I(file_inode(filp))->private; return cache_read(filp, buf, count, ppos, cd); } static ssize_t cache_write_pipefs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct cache_detail *cd = RPC_I(file_inode(filp))->private; return cache_write(filp, buf, count, ppos, cd); } static __poll_t cache_poll_pipefs(struct file *filp, poll_table *wait) { struct cache_detail *cd = RPC_I(file_inode(filp))->private; return cache_poll(filp, wait, cd); } static long cache_ioctl_pipefs(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct cache_detail *cd = RPC_I(inode)->private; return cache_ioctl(inode, filp, cmd, arg, cd); } static int cache_open_pipefs(struct inode *inode, struct file *filp) { struct cache_detail *cd = RPC_I(inode)->private; return cache_open(inode, filp, cd); } static int cache_release_pipefs(struct inode *inode, struct file *filp) { struct cache_detail *cd = RPC_I(inode)->private; return cache_release(inode, filp, cd); } const struct file_operations cache_file_operations_pipefs = { .owner = THIS_MODULE, .llseek = no_llseek, .read = cache_read_pipefs, .write = cache_write_pipefs, .poll = cache_poll_pipefs, .unlocked_ioctl = cache_ioctl_pipefs, /* for FIONREAD */ .open = cache_open_pipefs, .release = cache_release_pipefs, }; static int content_open_pipefs(struct inode *inode, struct file *filp) { struct cache_detail *cd = RPC_I(inode)->private; return content_open(inode, filp, cd); } static int content_release_pipefs(struct inode *inode, struct file *filp) { struct cache_detail *cd = RPC_I(inode)->private; return content_release(inode, filp, cd); } const struct file_operations content_file_operations_pipefs = { .open = content_open_pipefs, .read = seq_read, .llseek = seq_lseek, .release = content_release_pipefs, }; static int open_flush_pipefs(struct inode *inode, struct file *filp) { struct cache_detail *cd = RPC_I(inode)->private; return open_flush(inode, filp, cd); } static int release_flush_pipefs(struct inode *inode, struct file *filp) { struct cache_detail *cd = RPC_I(inode)->private; return release_flush(inode, filp, cd); } static ssize_t read_flush_pipefs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct cache_detail *cd = RPC_I(file_inode(filp))->private; return read_flush(filp, buf, count, ppos, cd); } static ssize_t write_flush_pipefs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct cache_detail *cd = RPC_I(file_inode(filp))->private; return write_flush(filp, buf, count, ppos, cd); } const struct file_operations cache_flush_operations_pipefs = { .open = open_flush_pipefs, .read = read_flush_pipefs, .write = write_flush_pipefs, .release = release_flush_pipefs, .llseek = no_llseek, }; int sunrpc_cache_register_pipefs(struct dentry *parent, const char *name, umode_t umode, struct cache_detail *cd) { struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd); if (IS_ERR(dir)) return PTR_ERR(dir); cd->pipefs = dir; return 0; } EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs); void sunrpc_cache_unregister_pipefs(struct cache_detail *cd) { if (cd->pipefs) { rpc_remove_cache_dir(cd->pipefs); cd->pipefs = NULL; } } EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) { spin_lock(&cd->hash_lock); if (!hlist_unhashed(&h->cache_list)){ sunrpc_begin_cache_remove_entry(h, cd); spin_unlock(&cd->hash_lock); sunrpc_end_cache_remove_entry(h, cd); } else spin_unlock(&cd->hash_lock); } EXPORT_SYMBOL_GPL(sunrpc_cache_unhash); |
22 13 13 22 22 22 22 22 22 22 22 212 160 95 190 105 48 68 116 14 105 23 216 123 217 13 13 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/inode.c - kernfs inode implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/security.h> #include "kernfs-internal.h" static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, }; static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) { static DEFINE_MUTEX(iattr_mutex); struct kernfs_iattrs *ret; mutex_lock(&iattr_mutex); if (kn->iattr || !alloc) goto out_unlock; kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); if (!kn->iattr) goto out_unlock; /* assign default attributes */ kn->iattr->ia_uid = GLOBAL_ROOT_UID; kn->iattr->ia_gid = GLOBAL_ROOT_GID; ktime_get_real_ts64(&kn->iattr->ia_atime); kn->iattr->ia_mtime = kn->iattr->ia_atime; kn->iattr->ia_ctime = kn->iattr->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); atomic_set(&kn->iattr->nr_user_xattrs, 0); atomic_set(&kn->iattr->user_xattr_size, 0); out_unlock: ret = kn->iattr; mutex_unlock(&iattr_mutex); return ret; } static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { return __kernfs_iattrs(kn, 1); } static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn) { return __kernfs_iattrs(kn, 0); } int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct kernfs_iattrs *attrs; unsigned int ia_valid = iattr->ia_valid; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (ia_valid & ATTR_UID) attrs->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) attrs->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) attrs->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) attrs->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) attrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) kn->mode = iattr->ia_mode; return 0; } /** * kernfs_setattr - set iattr on a node * @kn: target node * @iattr: iattr to set * * Returns 0 on success, -errno on failure. */ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; down_write(&kernfs_rwsem); ret = __kernfs_setattr(kn, iattr); up_write(&kernfs_rwsem); return ret; } int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; int error; if (!kn) return -EINVAL; down_write(&kernfs_rwsem); error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; error = __kernfs_setattr(kn, iattr); if (error) goto out; /* this ignores size changes */ setattr_copy(&init_user_ns, inode, iattr); out: up_write(&kernfs_rwsem); return error; } ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); } static inline void set_inode_attr(struct inode *inode, struct kernfs_iattrs *attrs) { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; inode->i_atime = attrs->ia_atime; inode->i_mtime = attrs->ia_mtime; inode->i_ctime = attrs->ia_ctime; } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { struct kernfs_iattrs *attrs = kn->iattr; inode->i_mode = kn->mode; if (attrs) /* * kernfs_node has non-default attributes get them from * persistent copy in kernfs_node. */ set_inode_attr(inode, attrs); if (kernfs_type(kn) == KERNFS_DIR) set_nlink(inode, kn->dir.subdirs + 2); } int kernfs_iop_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; down_read(&kernfs_rwsem); spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); generic_fillattr(&init_user_ns, inode, stat); spin_unlock(&inode->i_lock); up_read(&kernfs_rwsem); return 0; } static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &ram_aops; inode->i_op = &kernfs_iops; inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); /* initialize inode according to type */ switch (kernfs_type(kn)) { case KERNFS_DIR: inode->i_op = &kernfs_dir_iops; inode->i_fop = &kernfs_dir_fops; if (kn->flags & KERNFS_EMPTY_DIR) make_empty_dir_inode(inode); break; case KERNFS_FILE: inode->i_size = kn->attr.size; inode->i_fop = &kernfs_file_fops; break; case KERNFS_LINK: inode->i_op = &kernfs_symlink_iops; break; default: BUG(); } unlock_new_inode(inode); } /** * kernfs_get_inode - get inode for kernfs_node * @sb: super block * @kn: kernfs_node to allocate inode for * * Get inode for @kn. If such inode doesn't exist, a new inode is * allocated and basics are initialized. New inode is returned * locked. * * LOCKING: * Kernel thread context (may sleep). * * RETURNS: * Pointer to allocated inode on success, NULL on failure. */ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); return inode; } /* * The kernfs_node serves as both an inode and a directory entry for * kernfs. To prevent the kernfs inode numbers from being freed * prematurely we take a reference to kernfs_node from the kernfs inode. A * super_operations.evict_inode() implementation is needed to drop that * reference upon inode destruction. */ void kernfs_evict_inode(struct inode *inode) { struct kernfs_node *kn = inode->i_private; truncate_inode_pages_final(&inode->i_data); clear_inode(inode); kernfs_put(kn); } int kernfs_iop_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct kernfs_node *kn; int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; down_read(&kernfs_rwsem); spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); ret = generic_permission(&init_user_ns, inode, mask); spin_unlock(&inode->i_lock); up_read(&kernfs_rwsem); return ret; } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); if (!attrs) return -ENODATA; return simple_xattr_get(&attrs->xattrs, name, value, size); } int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { struct kernfs_iattrs *attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *suffix, void *value, size_t size) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_get(kn, name, value, size); } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_set(kn, name, value, size, flags); } static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, const char *full_name, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; ssize_t removed_size; int ret; if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { ret = -ENOSPC; goto dec_count_out; } if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { ret = -ENOSPC; goto dec_size_out; } ret = simple_xattr_set(xattrs, full_name, value, size, flags, &removed_size); if (!ret && removed_size >= 0) size = removed_size; else if (!ret) return 0; dec_size_out: atomic_sub(size, sz); dec_count_out: atomic_dec(nr); return ret; } static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, const char *full_name, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; ssize_t removed_size; int ret; ret = simple_xattr_set(xattrs, full_name, value, size, flags, &removed_size); if (removed_size >= 0) { atomic_sub(removed_size, sz); atomic_dec(nr); } return ret; } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *full_name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; struct kernfs_iattrs *attrs; if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) return -EOPNOTSUPP; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (value) return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, value, size, flags); else return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, value, size, flags); } static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_user_xattr_set, }; const struct xattr_handler *kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, &kernfs_user_xattr_handler, NULL }; |
19 19 3 8 7 6 15 13 19 19 15 13 3 2 1 320 319 320 22 22 9 19 19 22 22 22 19 14 8 22 22 12 2 22 8 22 13 22 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Garbage Collector For AF_UNIX sockets * * Garbage Collector: * Copyright (C) Barak A. Pearlmutter. * * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem. * If it doesn't work blame me, it worked when Barak sent it. * * Assumptions: * * - object w/ a bit * - free list * * Current optimizations: * * - explicit stack instead of recursion * - tail recurse on first born instead of immediate push/pop * - we gather the stuff that should not be killed into tree * and stack is just a path from root to the current pointer. * * Future optimizations: * * - don't just push entire root set; process in place * * Fixes: * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed. * Cope with changing max_files. * Al Viro 11 Oct 1998 * Graph may have cycles. That is, we can send the descriptor * of foo to bar and vice versa. Current code chokes on that. * Fix: move SCM_RIGHTS ones into the separate list and then * skb_free() them all instead of doing explicit fput's. * Another problem: since fput() may block somebody may * create a new unix_socket when we are in the middle of sweep * phase. Fix: revert the logic wrt MARKED. Mark everything * upon the beginning and unmark non-junk ones. * * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS * sent to connect()'ed but still not accept()'ed sockets. * Fixed. Old code had slightly different problem here: * extra fput() in situation when we passed the descriptor via * such socket and closed it (descriptor). That would happen on * each unix_gc() until the accept(). Since the struct file in * question would go to the free list and might be reused... * That might be the reason of random oopses on filp_close() * in unrelated processes. * * AV 28 Feb 1999 * Kill the explicit allocation of stack. Now we keep the tree * with root in dummy + pointer (gc_current) to one of the nodes. * Stack is represented as path from gc_current to dummy. Unmark * now means "add to tree". Push == "make it a son of gc_current". * Pop == "move gc_current to parent". We keep only pointers to * parents (->gc_tree). * AV 1 Mar 1999 * Damn. Added missing check for ->dead in listen queues scanning. * * Miklos Szeredi 25 Jun 2007 * Reimplement with a cycle collecting algorithm. This should * solve several problems with the previous code, like being racy * wrt receive and holding up unrelated socket operations. */ #include <linux/kernel.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/mutex.h> #include <linux/wait.h> #include <net/sock.h> #include <net/af_unix.h> #include <net/scm.h> #include <net/tcp_states.h> #include "scm.h" /* Internal data structures and random procedures: */ static LIST_HEAD(gc_candidates); static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { struct sk_buff *skb; struct sk_buff *next; spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { /* Do we have file descriptors ? */ if (UNIXCB(skb).fp) { bool hit = false; /* Process the descriptors of this socket */ int nfd = UNIXCB(skb).fp->count; struct file **fp = UNIXCB(skb).fp->fp; while (nfd--) { /* Get the socket the fd matches if it indeed does so */ struct sock *sk = unix_get_socket(*fp++); if (sk) { struct unix_sock *u = unix_sk(sk); /* Ignore non-candidates, they could * have been added to the queues after * starting the garbage collection */ if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true; func(u); } } } if (hit && hitlist != NULL) { __skb_unlink(skb, &x->sk_receive_queue); __skb_queue_tail(hitlist, skb); } } } spin_unlock(&x->sk_receive_queue.lock); } static void scan_children(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { if (x->sk_state != TCP_LISTEN) { scan_inflight(x, func, hitlist); } else { struct sk_buff *skb; struct sk_buff *next; struct unix_sock *u; LIST_HEAD(embryos); /* For a listening socket collect the queued embryos * and perform a scan on them as well. */ spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { u = unix_sk(skb->sk); /* An embryo cannot be in-flight, so it's safe * to use the list link. */ BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &embryos); } spin_unlock(&x->sk_receive_queue.lock); while (!list_empty(&embryos)) { u = list_entry(embryos.next, struct unix_sock, link); scan_inflight(&u->sk, func, hitlist); list_del_init(&u->link); } } } static void dec_inflight(struct unix_sock *usk) { usk->inflight--; } static void inc_inflight(struct unix_sock *usk) { usk->inflight++; } static void inc_inflight_move_tail(struct unix_sock *u) { u->inflight++; /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over */ if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) list_move_tail(&u->link, &gc_candidates); } static bool gc_in_progress; #define UNIX_INFLIGHT_TRIGGER_GC 16000 void wait_for_unix_gc(void) { /* If number of inflight sockets is insane, * force a garbage collect right now. * Paired with the WRITE_ONCE() in unix_inflight(), * unix_notinflight() and gc_in_progress(). */ if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && !READ_ONCE(gc_in_progress)) unix_gc(); wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); } /* The external entry point: unix_gc() */ void unix_gc(void) { struct sk_buff *next_skb, *skb; struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; struct list_head cursor; LIST_HEAD(not_cycle_list); spin_lock(&unix_gc_lock); /* Avoid a recursive GC. */ if (gc_in_progress) goto out; /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, true); /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. * * Holding unix_gc_lock will protect these candidates from * being detached, and hence from gaining an external * reference. Since there are no possible receivers, all * buffers currently on the candidates' queues stay there * during the garbage collection. * * We also know that no new candidate can be added onto the * receive queues. Other, non candidate sockets _can_ be * added to queue, so we must make sure only to touch * candidates. * * Embryos, though never candidates themselves, affect which * candidates are reachable by the garbage collector. Before * being added to a listener's queue, an embryo may already * receive data carrying SCM_RIGHTS, potentially making the * passed socket a candidate that is not yet reachable by the * collector. It becomes reachable once the embryo is * enqueued. Therefore, we must ensure that no SCM-laden * embryo appears in a (candidate) listener's queue between * consecutive scan_children() calls. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { struct sock *sk = &u->sk; long total_refs; total_refs = file_count(sk->sk_socket->file); BUG_ON(!u->inflight); BUG_ON(total_refs < u->inflight); if (total_refs == u->inflight) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); if (sk->sk_state == TCP_LISTEN) { unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); unix_state_unlock(sk); } } } /* Now remove all internal in-flight reference to children of * the candidates. */ list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, dec_inflight, NULL); /* Restore the references for children of all candidates, * which have remaining references. Do this recursively, so * only those remain, which form cyclic references. * * Use a "cursor" link, to make the list traversal safe, even * though elements might be moved about. */ list_add(&cursor, &gc_candidates); while (cursor.next != &gc_candidates) { u = list_entry(cursor.next, struct unix_sock, link); /* Move cursor to after the current position. */ list_move(&cursor, &u->link); if (u->inflight) { list_move_tail(&u->link, ¬_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); } } list_del(&cursor); /* Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs * which are creating the cycle(s). */ skb_queue_head_init(&hitlist); list_for_each_entry(u, &gc_candidates, link) { scan_children(&u->sk, inc_inflight, &hitlist); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (u->oob_skb) { kfree_skb(u->oob_skb); u->oob_skb = NULL; } #endif } /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ while (!list_empty(¬_cycle_list)) { u = list_entry(not_cycle_list.next, struct unix_sock, link); __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); list_move_tail(&u->link, &gc_inflight_list); } spin_unlock(&unix_gc_lock); /* We need io_uring to clean its registered files, ignore all io_uring * originated skbs. It's fine as io_uring doesn't keep references to * other io_uring instances and so killing all other files in the cycle * will put all io_uring references forcing it to go through normal * release.path eventually putting registered files. */ skb_queue_walk_safe(&hitlist, skb, next_skb) { if (skb->scm_io_uring) { __skb_unlink(skb, &hitlist); skb_queue_tail(&skb->sk->sk_receive_queue, skb); } } /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); /* There could be io_uring registered files, just push them back to * the inflight list */ list_for_each_entry_safe(u, next, &gc_candidates, link) list_move_tail(&u->link, &gc_inflight_list); /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); wake_up(&unix_gc_wait); out: spin_unlock(&unix_gc_lock); } |
6 42 76 76 76 76 25 51 131 5 126 126 126 120 6 6 6 6 6 6 106 98 15 80 2 16 34 34 34 34 34 34 374 358 18 18 18 18 132 118 7 8 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 | // SPDX-License-Identifier: GPL-2.0-or-later /* * lwtunnel Infrastructure for light weight tunnels like mpls * * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/lwtunnel.h> #include <linux/in.h> #include <linux/init.h> #include <linux/err.h> #include <net/lwtunnel.h> #include <net/rtnetlink.h> #include <net/ip6_fib.h> #include <net/rtnh.h> DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); #ifdef CONFIG_MODULES static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) { /* Only lwt encaps implemented without using an interface for * the encap need to return a string here. */ switch (encap_type) { case LWTUNNEL_ENCAP_MPLS: return "MPLS"; case LWTUNNEL_ENCAP_ILA: return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; case LWTUNNEL_ENCAP_BPF: return "BPF"; case LWTUNNEL_ENCAP_SEG6_LOCAL: return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ WARN_ON(1); break; } return NULL; } #endif /* CONFIG_MODULES */ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); return lws; } EXPORT_SYMBOL_GPL(lwtunnel_state_alloc); static const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, unsigned int num) { if (num > LWTUNNEL_ENCAP_MAX) return -ERANGE; return !cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, unsigned int encap_type) { int ret; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) return -ERANGE; ret = (cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[encap_type], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; bool found = false; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG_ATTR(extack, encap, "Unknown LWT encapsulation type"); return ret; } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state && try_module_get(ops->owner))) found = true; rcu_read_unlock(); if (found) { ret = ops->build_state(net, encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } else { /* don't rely on -EOPNOTSUPP to detect match as build_state * handlers could return it */ NL_SET_ERR_MSG_ATTR(extack, encap, "LWT encapsulation type not supported"); } return ret; } EXPORT_SYMBOL_GPL(lwtunnel_build_state); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); return ret; } rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); rcu_read_unlock(); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); rtnl_lock(); rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); rcu_read_unlock(); } } #endif ret = ops ? 0 : -EOPNOTSUPP; if (ret < 0) NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; struct nlattr *attrs; u16 encap_type; int attrlen; while (rtnh_ok(rtnh, remaining)) { attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { attrs = rtnh_attrs(rtnh); nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla_entype) { if (nla_len(nla_entype) < sizeof(u16)) { NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE"); return -EINVAL; } encap_type = nla_get_u16(nla_entype); if (lwtunnel_valid_encap_type(encap_type, extack) != 0) return -EOPNOTSUPP; } } rtnh = rtnh_next(rtnh, &remaining); } return 0; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr); void lwtstate_free(struct lwtunnel_state *lws) { const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; if (ops->destroy_state) { ops->destroy_state(lws); kfree_rcu(lws, rcu); } else { kfree(lws); } module_put(ops->owner); } EXPORT_SYMBOL_GPL(lwtstate_free); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; int ret; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; nest = nla_nest_start_noflag(skb, encap_attr); if (!nest) return -EMSGSIZE; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->fill_encap)) ret = ops->fill_encap(skb, lwtstate); rcu_read_unlock(); if (ret) goto nla_put_failure; nla_nest_end(skb, nest); ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); if (ret) goto nla_put_failure; return 0; nla_put_failure: nla_nest_cancel(skb, nest); return (ret == -EOPNOTSUPP ? 0 : ret); } EXPORT_SYMBOL_GPL(lwtunnel_fill_encap); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->get_encap_size)) ret = nla_total_size(ops->get_encap_size(lwtstate)); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!a && !b) return 0; if (!a || !b) return 1; if (a->type != b->type) return 1; if (a->type == LWTUNNEL_ENCAP_NONE || a->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[a->type]); if (likely(ops && ops->cmp_encap)) ret = ops->cmp_encap(a, b); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->output)) ret = ops->output(net, sk, skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->xmit)) ret = ops->xmit(skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->input)) ret = ops->input(skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_input); |
1 1 1 9 1 3 3 1 2 2 1 14 3 12 13 2 1 1 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 | // SPDX-License-Identifier: GPL-2.0-only /* * DCCP connection tracking protocol helper * * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/dccp.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> /* Timeouts are based on values from RFC4340: * * - REQUEST: * * 8.1.2. Client Request * * A client MAY give up on its DCCP-Requests after some time * (3 minutes, for example). * * - RESPOND: * * 8.1.3. Server Response * * It MAY also leave the RESPOND state for CLOSED after a timeout of * not less than 4MSL (8 minutes); * * - PARTOPEN: * * 8.1.5. Handshake Completion * * If the client remains in PARTOPEN for more than 4MSL (8 minutes), * it SHOULD reset the connection with Reset Code 2, "Aborted". * * - OPEN: * * The DCCP timestamp overflows after 11.9 hours. If the connection * stays idle this long the sequence number won't be recognized * as valid anymore. * * - CLOSEREQ/CLOSING: * * 8.3. Termination * * The retransmission timer should initially be set to go off in two * round-trip times and should back off to not less than once every * 64 seconds ... * * - TIMEWAIT: * * 4.3. States * * A server or client socket remains in this state for 2MSL (4 minutes) * after the connection has been town down, ... */ #define DCCP_MSL (2 * 60 * HZ) static const char * const dccp_state_names[] = { [CT_DCCP_NONE] = "NONE", [CT_DCCP_REQUEST] = "REQUEST", [CT_DCCP_RESPOND] = "RESPOND", [CT_DCCP_PARTOPEN] = "PARTOPEN", [CT_DCCP_OPEN] = "OPEN", [CT_DCCP_CLOSEREQ] = "CLOSEREQ", [CT_DCCP_CLOSING] = "CLOSING", [CT_DCCP_TIMEWAIT] = "TIMEWAIT", [CT_DCCP_IGNORE] = "IGNORE", [CT_DCCP_INVALID] = "INVALID", }; #define sNO CT_DCCP_NONE #define sRQ CT_DCCP_REQUEST #define sRS CT_DCCP_RESPOND #define sPO CT_DCCP_PARTOPEN #define sOP CT_DCCP_OPEN #define sCR CT_DCCP_CLOSEREQ #define sCG CT_DCCP_CLOSING #define sTW CT_DCCP_TIMEWAIT #define sIG CT_DCCP_IGNORE #define sIV CT_DCCP_INVALID /* * DCCP state transition table * * The assumption is the same as for TCP tracking: * * We are the man in the middle. All the packets go through us but might * get lost in transit to the destination. It is assumed that the destination * can't receive segments we haven't seen. * * The following states exist: * * NONE: Initial state, expecting Request * REQUEST: Request seen, waiting for Response from server * RESPOND: Response from server seen, waiting for Ack from client * PARTOPEN: Ack after Response seen, waiting for packet other than Response, * Reset or Sync from server * OPEN: Packet other than Response, Reset or Sync seen * CLOSEREQ: CloseReq from server seen, expecting Close from client * CLOSING: Close seen, expecting Reset * TIMEWAIT: Reset seen * IGNORE: Not determinable whether packet is valid * * Some states exist only on one side of the connection: REQUEST, RESPOND, * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to * the one it was in before. * * Packets are marked as ignored (sIG) if we don't know if they're valid * (for example a reincarnation of a connection we didn't notice is dead * already) and the server may send back a connection closing Reset or a * Response. They're also used for Sync/SyncAck packets, which we don't * care about. */ static const u_int8_t dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { [CT_DCCP_ROLE_CLIENT] = { [DCCP_PKT_REQUEST] = { /* * sNO -> sRQ Regular Request * sRQ -> sRQ Retransmitted Request or reincarnation * sRS -> sRS Retransmitted Request (apparently Response * got lost after we saw it) or reincarnation * sPO -> sIG Ignore, conntrack might be out of sync * sOP -> sIG Ignore, conntrack might be out of sync * sCR -> sIG Ignore, conntrack might be out of sync * sCG -> sIG Ignore, conntrack might be out of sync * sTW -> sRQ Reincarnation * * sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */ sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, }, [DCCP_PKT_RESPONSE] = { /* * sNO -> sIV Invalid * sRQ -> sIG Ignore, might be response to ignored Request * sRS -> sIG Ignore, might be response to ignored Request * sPO -> sIG Ignore, might be response to ignored Request * sOP -> sIG Ignore, might be response to ignored Request * sCR -> sIG Ignore, might be response to ignored Request * sCG -> sIG Ignore, might be response to ignored Request * sTW -> sIV Invalid, reincarnation in reverse direction * goes through sRQ * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, }, [DCCP_PKT_ACK] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN * sOP -> sOP Regular ACK, remain in OPEN * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV }, [DCCP_PKT_DATA] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) * sOP -> sOP Regular Data packet * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) * sCG -> sCG Data in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, }, [DCCP_PKT_DATAACK] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) * sPO -> sPO Remain in PARTOPEN state * sOP -> sOP Regular DataAck packet in OPEN state * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV }, [DCCP_PKT_CLOSEREQ] = { /* * CLOSEREQ may only be sent by the server. * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, [DCCP_PKT_CLOSE] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sCG Client-initiated close * sOP -> sCG Client-initiated close * sCR -> sCG Close in response to CloseReq (8.3.) * sCG -> sCG Retransmit * sTW -> sIV Late retransmit, already in TIME_WAIT * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV }, [DCCP_PKT_RESET] = { /* * sNO -> sIV No connection * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) * sRS -> sTW Response received without Request * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) * sOP -> sTW Connection reset * sCR -> sTW Connection reset * sCG -> sTW Connection reset * sTW -> sIG Ignore (don't refresh timer) * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG }, [DCCP_PKT_SYNC] = { /* * We currently ignore Sync packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, [DCCP_PKT_SYNCACK] = { /* * We currently ignore SyncAck packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, }, [CT_DCCP_ROLE_SERVER] = { [DCCP_PKT_REQUEST] = { /* * sNO -> sIV Invalid * sRQ -> sIG Ignore, conntrack might be out of sync * sRS -> sIG Ignore, conntrack might be out of sync * sPO -> sIG Ignore, conntrack might be out of sync * sOP -> sIG Ignore, conntrack might be out of sync * sCR -> sIG Ignore, conntrack might be out of sync * sCG -> sIG Ignore, conntrack might be out of sync * sTW -> sRQ Reincarnation, must reverse roles * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ }, [DCCP_PKT_RESPONSE] = { /* * sNO -> sIV Response without Request * sRQ -> sRS Response to clients Request * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT) * sPO -> sIG Response to an ignored Request or late retransmit * sOP -> sIG Ignore, might be response to ignored Request * sCR -> sIG Ignore, might be response to ignored Request * sCG -> sIG Ignore, might be response to ignored Request * sTW -> sIV Invalid, Request from client in sTW moves to sRQ * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV }, [DCCP_PKT_ACK] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP Enter OPEN state (8.1.5.) * sOP -> sOP Regular Ack in OPEN state * sCR -> sIV Waiting for Close from client * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV }, [DCCP_PKT_DATA] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP Enter OPEN state (8.1.5.) * sOP -> sOP Regular Data packet in OPEN state * sCR -> sIV Waiting for Close from client * sCG -> sCG Data in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV }, [DCCP_PKT_DATAACK] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP Enter OPEN state (8.1.5.) * sOP -> sOP Regular DataAck in OPEN state * sCR -> sIV Waiting for Close from client * sCG -> sCG Data in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV }, [DCCP_PKT_CLOSEREQ] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) * sOP -> sCR CloseReq in OPEN state * sCR -> sCR Retransmit * sCG -> sCR Simultaneous close, client sends another Close * sTW -> sIV Already closed * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV }, [DCCP_PKT_CLOSE] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP -> sCG Move direcly to CLOSING * sOP -> sCG Move to CLOSING * sCR -> sIV Close after CloseReq is invalid * sCG -> sCG Retransmit * sTW -> sIV Already closed * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV }, [DCCP_PKT_RESET] = { /* * sNO -> sIV No connection * sRQ -> sTW Reset in response to Request * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) * sOP -> sTW * sCR -> sTW * sCG -> sTW * sTW -> sIG Ignore (don't refresh timer) * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */ sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG }, [DCCP_PKT_SYNC] = { /* * We currently ignore Sync packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, [DCCP_PKT_SYNCACK] = { /* * We currently ignore SyncAck packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, }, }; static noinline bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct dccp_hdr *dh, const struct nf_hook_state *hook_state) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; const char *msg; u_int8_t state; state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; switch (state) { default: dn = nf_dccp_pernet(net); if (dn->dccp_loose == 0) { msg = "not picking up existing connection "; goto out_invalid; } break; case CT_DCCP_REQUEST: break; case CT_DCCP_INVALID: msg = "invalid state transition "; goto out_invalid; } ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.state = CT_DCCP_NONE; ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; ct->proto.dccp.handshake_seq = 0; return true; out_invalid: nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg); return false; } static u64 dccp_ack_seq(const struct dccp_hdr *dh) { const struct dccp_hdr_ack_bits *dhack; dhack = (void *)dh + __dccp_basic_hdr_len(dh); return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); } static bool dccp_error(const struct dccp_hdr *dh, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { static const unsigned long require_seq48 = 1 << DCCP_PKT_REQUEST | 1 << DCCP_PKT_RESPONSE | 1 << DCCP_PKT_CLOSEREQ | 1 << DCCP_PKT_CLOSE | 1 << DCCP_PKT_RESET | 1 << DCCP_PKT_SYNC | 1 << DCCP_PKT_SYNCACK; unsigned int dccp_len = skb->len - dataoff; unsigned int cscov; const char *msg; u8 type; BUILD_BUG_ON(DCCP_PKT_INVALID >= BITS_PER_LONG); if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || dh->dccph_doff * 4 > dccp_len) { msg = "nf_ct_dccp: truncated/malformed packet "; goto out_invalid; } cscov = dccp_len; if (dh->dccph_cscov) { cscov = (dh->dccph_cscov - 1) * 4; if (cscov > dccp_len) { msg = "nf_ct_dccp: bad checksum coverage "; goto out_invalid; } } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_DCCP, state->pf)) { msg = "nf_ct_dccp: bad checksum "; goto out_invalid; } type = dh->dccph_type; if (type >= DCCP_PKT_INVALID) { msg = "nf_ct_dccp: reserved packet type "; goto out_invalid; } if (test_bit(type, &require_seq48) && !dh->dccph_x) { msg = "nf_ct_dccp: type lacks 48bit sequence numbers"; goto out_invalid; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg); return true; } struct nf_conntrack_dccp_buf { struct dccp_hdr dh; /* generic header part */ struct dccp_hdr_ext ext; /* optional depending dh->dccph_x */ union { /* depends on header type */ struct dccp_hdr_ack_bits ack; struct dccp_hdr_request req; struct dccp_hdr_response response; struct dccp_hdr_reset rst; } u; }; static struct dccp_hdr * dccp_header_pointer(const struct sk_buff *skb, int offset, const struct dccp_hdr *dh, struct nf_conntrack_dccp_buf *buf) { unsigned int hdrlen = __dccp_hdr_len(dh); if (hdrlen > sizeof(*buf)) return NULL; return skb_header_pointer(skb, offset, hdrlen, buf); } int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conntrack_dccp_buf _dh; u_int8_t type, old_state, new_state; enum ct_dccp_roles role; unsigned int *timeouts; struct dccp_hdr *dh; dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); if (!dh) return NF_DROP; if (dccp_error(dh, skb, dataoff, state)) return -NF_ACCEPT; /* pull again, including possible 48 bit sequences and subtype header */ dh = dccp_header_pointer(skb, dataoff, dh, &_dh); if (!dh) return NF_DROP; type = dh->dccph_type; if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) return -NF_ACCEPT; if (type == DCCP_PKT_RESET && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* Tear down connection immediately if only reply is a RESET */ nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } spin_lock_bh(&ct->lock); role = ct->proto.dccp.role[dir]; old_state = ct->proto.dccp.state; new_state = dccp_state_table[role][type][old_state]; switch (new_state) { case CT_DCCP_REQUEST: if (old_state == CT_DCCP_TIMEWAIT && role == CT_DCCP_ROLE_SERVER) { /* Reincarnation in the reverse direction: reopen and * reverse client/server roles. */ ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER; } break; case CT_DCCP_RESPOND: if (old_state == CT_DCCP_REQUEST) ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); break; case CT_DCCP_PARTOPEN: if (old_state == CT_DCCP_RESPOND && type == DCCP_PKT_ACK && dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq) set_bit(IPS_ASSURED_BIT, &ct->status); break; case CT_DCCP_IGNORE: /* * Connection tracking might be out of sync, so we ignore * packets that might establish a new connection and resync * if the server responds with a valid Response. */ if (ct->proto.dccp.last_dir == !dir && ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && type == DCCP_PKT_RESPONSE) { ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); new_state = CT_DCCP_RESPOND; break; } ct->proto.dccp.last_dir = dir; ct->proto.dccp.last_pkt = type; spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet"); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition"); return -NF_ACCEPT; } ct->proto.dccp.last_dir = dir; ct->proto.dccp.last_pkt = type; ct->proto.dccp.state = new_state; spin_unlock_bh(&ct->lock); if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; } static bool dccp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.dccp.state) { case CT_DCCP_CLOSEREQ: case CT_DCCP_CLOSING: case CT_DCCP_TIMEWAIT: return true; default: break; } return false; } #ifdef CONFIG_NF_CONNTRACK_PROCFS static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); } #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, cpu_to_be64(ct->proto.dccp.handshake_seq), CTA_PROTOINFO_DCCP_PAD)) goto nla_put_failure; skip_state: nla_nest_end(skb, nest_parms); spin_unlock_bh(&ct->lock); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, }; #define DCCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \ NLA_ALIGN(NLA_HDRLEN + 0)) static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; int err; if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr, dccp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_DCCP_STATE] || !tb[CTA_PROTOINFO_DCCP_ROLE] || nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { return -EINVAL; } spin_lock_bh(&ct->lock); ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; } else { ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; } if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { ct->proto.dccp.handshake_seq = be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); } spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { struct nf_dccp_net *dn = nf_dccp_pernet(net); unsigned int *timeouts = data; int i; if (!timeouts) timeouts = dn->dccp_timeout; /* set default DCCP timeouts. */ for (i=0; i<CT_DCCP_MAX; i++) timeouts[i] = dn->dccp_timeout[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST]; return 0; } static int dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_dccp_init_net(struct net *net) { struct nf_dccp_net *dn = nf_dccp_pernet(net); /* default values */ dn->dccp_loose = 1; dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; /* timeouts[0] is unused, make it same as SYN_SENT so * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { .l4proto = IPPROTO_DCCP, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = DCCP_NLATTR_SIZE, .to_nlattr = dccp_to_nlattr, .from_nlattr = nlattr_to_dccp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = dccp_timeout_nlattr_to_obj, .obj_to_nlattr = dccp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_DCCP_MAX, .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, .nla_policy = dccp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | // SPDX-License-Identifier: GPL-2.0-only /* * Common code for control of lockd and nfsv4 grace periods. * * Transplanted from lockd code */ #include <linux/module.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/fs.h> static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** * locks_start_grace * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * A grace period is a period during which locks should not be given * out. Currently grace periods are only enforced by the two lock * managers (lockd and nfsd), using the locks_in_grace() function to * check when they are in a grace period. * * This function is called to start a grace period. */ void locks_start_grace(struct net *net, struct lock_manager *lm) { struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); if (list_empty(&lm->list)) list_add(&lm->list, grace_list); else WARN(1, "double list_add attempt detected in net %x %s\n", net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to * resume regular locking. The grace period will not end until all lock * managers that called locks_start_grace() also call locks_end_grace(). * Note that callers count on it being safe to call this more than once, * and the second call should be a no-op. */ void locks_end_grace(struct lock_manager *lm) { spin_lock(&grace_lock); list_del_init(&lm->list); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_end_grace); static bool __state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); struct lock_manager *lm; if (!open) return !list_empty(grace_list); spin_lock(&grace_lock); list_for_each_entry(lm, grace_list, list) { if (lm->block_opens) { spin_unlock(&grace_lock); return true; } } spin_unlock(&grace_lock); return false; } /** * locks_in_grace * @net: network namespace * * Lock managers call this function to determine when it is OK for them * to answer ordinary lock requests, and when they should accept only * lock reclaims. */ bool locks_in_grace(struct net *net) { return __state_in_grace(net, false); } EXPORT_SYMBOL_GPL(locks_in_grace); bool opens_in_grace(struct net *net) { return __state_in_grace(net, true); } EXPORT_SYMBOL_GPL(opens_in_grace); static int __net_init grace_init_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); INIT_LIST_HEAD(grace_list); return 0; } static void __net_exit grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); WARN_ONCE(!list_empty(grace_list), "net %x %s: grace_list is not empty\n", net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { .init = grace_init_net, .exit = grace_exit_net, .id = &grace_net_id, .size = sizeof(struct list_head), }; static int __init init_grace(void) { return register_pernet_subsys(&grace_net_ops); } static void __exit exit_grace(void) { unregister_pernet_subsys(&grace_net_ops); } MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>"); MODULE_LICENSE("GPL"); module_init(init_grace) module_exit(exit_grace) |
44 5 91 90 55 3 54 2 2 6 2 1 2 2 3 7 26 12 21 25 2 3 7 26 26 26 26 26 1 4 21 6 6 6 6 6 4 3 2 2 1 1 3 2 2 3 3 1 1 1 1 2 2 3 70 70 65 45 21 10 10 10 22 22 9 9 17 10 7 6 3 7 7 11 6 8 9 9 9 9 9 9 9 9 9 9 9 9 268 261 4 3 342 340 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Extension Header handling for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Andi Kleen <ak@muc.de> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ /* Changes: * yoshfuji : ensure not to overrun while parsing * tlv options. * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). * YOSHIFUJI Hideaki @USAGI Register inbound extension header * handlers as inet6_protocol{}. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/slab.h> #include <linux/export.h> #include <net/dst.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/calipso.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/xfrm.h> #endif #include <linux/seg6.h> #include <net/seg6.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif #include <net/rpl.h> #include <linux/ioam6.h> #include <net/ioam6.h> #include <net/dst_metadata.h> #include <linux/uaccess.h> /********************* Generic functions *********************/ /* An unknown option is detected, decide what to do */ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, bool disallow_unknowns) { if (disallow_unknowns) { /* If unknown TLVs are disallowed by configuration * then always silently drop packet. Note this also * means no ICMP parameter problem is sent which * could be a good property to mitigate a reflection DOS * attack. */ goto drop; } switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; case 1: /* drop packet */ break; case 3: /* Send ICMP if not a multicast address and drop packet */ /* Actually, it is redundant check. icmp_send will recheck in any case. */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; fallthrough; case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); return false; } drop: kfree_skb(skb); return false; } static bool ipv6_hop_ra(struct sk_buff *skb, int optoff); static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff); static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff); static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff); #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff); #endif /* Parse tlv encoded option header (hop-by-hop or destination) */ static bool ip6_parse_tlv(bool hopbyhop, struct sk_buff *skb, int max_count) { int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); bool disallow_unknowns = false; int tlv_count = 0; int padlen = 0; if (unlikely(max_count < 0)) { disallow_unknowns = true; max_count = -max_count; } if (skb_transport_offset(skb) + len > skb_headlen(skb)) goto bad; off += 2; len -= 2; while (len > 0) { int optlen, i; if (nh[off] == IPV6_TLV_PAD1) { padlen++; if (padlen > 7) goto bad; off++; len--; continue; } if (len < 2) goto bad; optlen = nh[off + 1] + 2; if (optlen > len) goto bad; if (nh[off] == IPV6_TLV_PADN) { /* RFC 2460 states that the purpose of PadN is * to align the containing header to multiples * of 8. 7 is therefore the highest valid value. * See also RFC 4942, Section 2.1.9.5. */ padlen += optlen; if (padlen > 7) goto bad; /* RFC 4942 recommends receiving hosts to * actively check PadN payload to contain * only zeroes. */ for (i = 2; i < optlen; i++) { if (nh[off + i] != 0) goto bad; } } else { tlv_count++; if (tlv_count > max_count) goto bad; if (hopbyhop) { switch (nh[off]) { case IPV6_TLV_ROUTERALERT: if (!ipv6_hop_ra(skb, off)) return false; break; case IPV6_TLV_IOAM: if (!ipv6_hop_ioam(skb, off)) return false; break; case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) return false; break; case IPV6_TLV_CALIPSO: if (!ipv6_hop_calipso(skb, off)) return false; break; default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } else { switch (nh[off]) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_TLV_HAO: if (!ipv6_dest_hao(skb, off)) return false; break; #endif default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } padlen = 0; } off += optlen; len -= optlen; } if (len == 0) return true; bad: kfree_skb(skb); return false; } /***************************** Destination options header. *****************************/ #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) { struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); int ret; if (opt->dsthao) { net_dbg_ratelimited("hao duplicated\n"); goto discard; } opt->dsthao = opt->dst1; opt->dst1 = 0; hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); if (hao->length != 16) { net_dbg_ratelimited("hao invalid option length = %d\n", hao->length); goto discard; } if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { net_dbg_ratelimited("hao is not an unicast addr: %pI6\n", &hao->addr); goto discard; } ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr, (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS); if (unlikely(ret < 0)) goto discard; if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) goto discard; /* update all variable using below by copied skbuff */ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); ipv6h = ipv6_hdr(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; swap(ipv6h->saddr, hao->addr); if (skb->tstamp == 0) __net_timestamp(skb); return true; discard: kfree_skb(skb); return false; } #endif static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst->dev), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > net->ipv6.sysctl.max_dst_opts_len) goto fail_and_free; opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; #else opt->nhoff = opt->dst1; #endif return 1; } __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } static void seg6_update_csum(struct sk_buff *skb) { struct ipv6_sr_hdr *hdr; struct in6_addr *addr; __be32 from, to; /* srh is at transport offset and seg_left is already decremented * but daddr is not yet updated with next segment */ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); addr = hdr->segments + hdr->segments_left; hdr->segments_left++; from = *(__be32 *)hdr; hdr->segments_left--; to = *(__be32 *)hdr; /* update skb csum with diff resulting from seg_left decrement */ update_csum_diff4(skb, from, to); /* compute csum diff between current and next segment and update */ update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr), (__be32 *)addr); } static int ipv6_srh_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct ipv6_sr_hdr *hdr; struct inet6_dev *idev; struct in6_addr *addr; int accept_seg6; hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); idev = __in6_dev_get(skb->dev); accept_seg6 = net->ipv6.devconf_all->seg6_enabled; if (accept_seg6 > idev->cnf.seg6_enabled) accept_seg6 = idev->cnf.seg6_enabled; if (!accept_seg6) { kfree_skb(skb); return -1; } #ifdef CONFIG_IPV6_SEG6_HMAC if (!seg6_hmac_validate_skb(skb)) { kfree_skb(skb); return -1; } #endif looped_back: if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); if (!pskb_pull(skb, offset)) { kfree_skb(skb); return -1; } skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; if (hdr->nexthdr == NEXTHDR_IPV4) skb->protocol = htons(ETH_P_IP); __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } } hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); hdr->segments_left--; addr = hdr->segments + hdr->segments_left; skb_push(skb, sizeof(struct ipv6hdr)); if (skb->ip_summed == CHECKSUM_COMPLETE) seg6_update_csum(skb); ipv6_hdr(skb)->daddr = *addr; skb_dst_drop(skb); ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } static int ipv6_rpl_srh_rcv(struct sk_buff *skb) { struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr; struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct inet6_dev *idev; struct ipv6hdr *oldhdr; struct in6_addr addr; unsigned char *buf; int accept_rpl_seg; int i, err; u64 n = 0; u32 r; idev = __in6_dev_get(skb->dev); accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled; if (accept_rpl_seg > idev->cnf.rpl_seg_enabled) accept_rpl_seg = idev->cnf.rpl_seg_enabled; if (!accept_rpl_seg) { kfree_skb(skb); return -1; } looped_back: hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); if (!pskb_pull(skb, offset)) { kfree_skb(skb); return -1; } skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } if (!pskb_may_pull(skb, sizeof(*hdr))) { kfree_skb(skb); return -1; } n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); r = do_div(n, (16 - hdr->cmpri)); /* checks if calculation was without remainder and n fits into * unsigned char which is segments_left field. Should not be * higher than that. */ if (r || (n + 1) > 255) { kfree_skb(skb); return -1; } if (hdr->segments_left > n + 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } if (!pskb_may_pull(skb, ipv6_rpl_srh_size(n, hdr->cmpri, hdr->cmpre))) { kfree_skb(skb); return -1; } hdr->segments_left--; i = n - hdr->segments_left; buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC); if (unlikely(!buf)) { kfree_skb(skb); return -1; } ohdr = (struct ipv6_rpl_sr_hdr *)buf; ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); if ((ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) || (ipv6_addr_type(&ohdr->rpl_segaddr[i]) & IPV6_ADDR_MULTICAST)) { kfree_skb(skb); kfree(buf); return -1; } err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1); if (err) { icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0); kfree_skb(skb); kfree(buf); return -1; } addr = ipv6_hdr(skb)->daddr; ipv6_hdr(skb)->daddr = ohdr->rpl_segaddr[i]; ohdr->rpl_segaddr[i] = addr; ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n); oldhdr = ipv6_hdr(skb); skb_pull(skb, ((hdr->hdrlen + 1) << 3)); skb_postpull_rcsum(skb, oldhdr, sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); if (unlikely(!hdr->segments_left)) { if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); kfree(buf); return -1; } oldhdr = ipv6_hdr(skb); } skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr)); memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, ipv6_hdr(skb), sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3)); kfree(buf); skb_dst_drop(skb); ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } /******************************** Routing header. ********************************/ /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; struct in6_addr daddr; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; struct net *net = dev_net(skb->dev); int accept_source_route = net->ipv6.devconf_all->accept_source_route; idev = __in6_dev_get(skb->dev); if (idev && accept_source_route > idev->cnf.accept_source_route) accept_source_route = idev->cnf.accept_source_route; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } switch (hdr->type) { case IPV6_SRCRT_TYPE_4: /* segment routing */ return ipv6_srh_rcv(skb); case IPV6_SRCRT_TYPE_3: /* rpl segment routing */ return ipv6_rpl_srh_rcv(skb); default: break; } looped_back: if (hdr->segments_left == 0) { switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: /* Silently discard type 2 header unless it was * processed by own */ if (!addr) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } opt->lastopt = opt->srcrt = skb_network_header_len(skb); skb->transport_header += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (accept_source_route < 0) goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } break; #endif default: goto unknown_rh; } /* * This is the routing header forwarding algorithm from * RFC 2460, page 16. */ n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } /* We are about to mangle packet header. Be careful! Do not damage packets queued somewhere. */ if (skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; i = n - --hdr->segments_left; rthdr = (struct rt0_hdr *) hdr; addr = rthdr->addr; addr += i - 1; switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } if (ipv6_addr_is_multicast(addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } daddr = *addr; *addr = ipv6_hdr(skb)->daddr; ipv6_hdr(skb)->daddr = daddr; skb_dst_drop(skb); ip6_route_input(skb); if (skb_dst(skb)->error) { skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; goto looped_back; } skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; unknown_rh: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; } static const struct inet6_protocol rthdr_protocol = { .handler = ipv6_rthdr_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol destopt_protocol = { .handler = ipv6_destopt_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol nodata_protocol = { .handler = dst_discard, .flags = INET6_PROTO_NOPOLICY, }; int __init ipv6_exthdrs_init(void) { int ret; ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING); if (ret) goto out; ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS); if (ret) goto out_rthdr; ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE); if (ret) goto out_destopt; out: return ret; out_destopt: inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); out_rthdr: inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); goto out; }; void ipv6_exthdrs_exit(void) { inet6_del_protocol(&nodata_protocol, IPPROTO_NONE); inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); } /********************************** Hop-by-hop options. **********************************/ /* * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). */ static inline struct net *ipv6_skb_net(struct sk_buff *skb) { return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); } /* Router Alert as of RFC 2711 */ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] == 2) { IP6CB(skb)->flags |= IP6SKB_ROUTERALERT; memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); return true; } net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n", nh[optoff + 1]); kfree_skb(skb); return false; } /* IOAM */ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) { struct ioam6_trace_hdr *trace; struct ioam6_namespace *ns; struct ioam6_hdr *hdr; /* Bad alignment (must be 4n-aligned) */ if (optoff & 3) goto drop; /* Ignore if IOAM is not enabled on ingress */ if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) goto ignore; /* Truncated Option header */ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); if (hdr->opt_len < 2) goto drop; switch (hdr->type) { case IOAM6_TYPE_PREALLOC: /* Truncated Pre-allocated Trace header */ if (hdr->opt_len < 2 + sizeof(*trace)) goto drop; /* Malformed Pre-allocated Trace header */ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) goto drop; /* Ignore if the IOAM namespace is unknown */ ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); if (!ns) goto ignore; if (!skb_valid_dst(skb)) ip6_route_input(skb); ioam6_fill_trace_data(skb, ns, trace); break; default: break; } ignore: return true; drop: kfree_skb(skb); return false; } /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct net *net = ipv6_skb_net(skb); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return false; } if (ipv6_hdr(skb)->payload_len) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: kfree_skb(skb); return false; } /* CALIPSO RFC 5570 */ static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] < 8) goto drop; if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1]) goto drop; if (!calipso_validate(skb, nh + optoff)) goto drop; return true; drop: kfree_skb(skb); return false; } int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); int extlen; /* * skb_network_header(skb) is equal to skb->data, and * skb_network_header_len(skb) is always equal to * sizeof(struct ipv6hdr) by definition of * hop-by-hop options. */ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > net->ipv6.sysctl.max_hbh_opts_len) goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; } return -1; } /* * Creating outbound headers. * * "build" functions work when skb is filled from head to tail (datagram) * "push" functions work when headers are added from tail to head (tcp) * * In both cases we assume, that caller reserved enough room * for headers. */ static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct rt0_hdr *phdr, *ihdr; int hops; ihdr = (struct rt0_hdr *) opt; phdr = skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); hops = ihdr->rt_hdr.hdrlen >> 1; if (hops > 1) memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; phdr->rt_hdr.nexthdr = *proto; *proto = NEXTHDR_ROUTING; } static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct ipv6_sr_hdr *sr_phdr, *sr_ihdr; int plen, hops; sr_ihdr = (struct ipv6_sr_hdr *)opt; plen = (sr_ihdr->hdrlen + 1) << 3; sr_phdr = skb_push(skb, plen); memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr)); hops = sr_ihdr->first_segment + 1; memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1, (hops - 1) * sizeof(struct in6_addr)); sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; if (sr_ihdr->hdrlen > hops * 2) { int tlvs_offset, tlvs_length; tlvs_offset = (1 + hops * 2) << 3; tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; memcpy((char *)sr_phdr + tlvs_offset, (char *)sr_ihdr + tlvs_offset, tlvs_length); } #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; if (skb->dev) net = dev_net(skb->dev); else if (skb->sk) net = sock_net(skb->sk); WARN_ON(!net); if (net) seg6_push_hmac(net, saddr, sr_phdr); } #endif sr_phdr->nexthdr = *proto; *proto = NEXTHDR_ROUTING; } static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { switch (opt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); break; case IPV6_SRCRT_TYPE_4: ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); break; default: break; } } static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) { struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt)); memcpy(h, opt, ipv6_optlen(opt)); h->nexthdr = *proto; *proto = type; } void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr, struct in6_addr *saddr) { if (opt->srcrt) { ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); /* * IPV6_RTHDRDSTOPTS is ignored * unless IPV6_RTHDR is set (RFC3542). */ if (opt->dst0opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); } if (opt->hopopt) ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); } void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) { if (opt->dst1opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); } EXPORT_SYMBOL(ipv6_push_frag_opts); struct ipv6_txoptions * ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) *((char **)&opt2->dst0opt) += dif; if (opt2->dst1opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; refcount_set(&opt2->refcnt, 1); } return opt2; } EXPORT_SYMBOL_GPL(ipv6_dup_options); static void ipv6_renew_option(int renewtype, struct ipv6_opt_hdr **dest, struct ipv6_opt_hdr *old, struct ipv6_opt_hdr *new, int newtype, char **p) { struct ipv6_opt_hdr *src; src = (renewtype == newtype ? new : old); if (!src) return; memcpy(*p, src, ipv6_optlen(src)); *dest = (struct ipv6_opt_hdr *)*p; *p += CMSG_ALIGN(ipv6_optlen(*dest)); } /** * ipv6_renew_options - replace a specific ext hdr with a new one. * * @sk: sock from which to allocate memory * @opt: original options * @newtype: option type to replace in @opt * @newopt: new option of type @newtype to replace (user-mem) * * Returns a new set of options which is a copy of @opt with the * option type @newtype replaced with @newopt. * * @opt may be NULL, in which case a new set of options is returned * containing just @newopt. * * @newopt may be NULL, in which case the specified option type is * not copied into the new set of options. * * The new set of options is allocated from the socket option memory * buffer of @sk. */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt) { int tot_len = 0; char *p; struct ipv6_txoptions *opt2; if (opt) { if (newtype != IPV6_HOPOPTS && opt->hopopt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt)); if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt)); if (newtype != IPV6_RTHDR && opt->srcrt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt)); if (newtype != IPV6_DSTOPTS && opt->dst1opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); } if (newopt) tot_len += CMSG_ALIGN(ipv6_optlen(newopt)); if (!tot_len) return NULL; tot_len += sizeof(*opt2); opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); if (!opt2) return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); refcount_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, (opt ? opt->hopopt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, (opt ? opt->dst0opt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDR, (struct ipv6_opt_hdr **)&opt2->srcrt, (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, (opt ? opt->dst1opt : NULL), newopt, newtype, &p); opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0); opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); return opt2; } struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { /* * ignore the dest before srcrt unless srcrt is being included. * --yoshfuji */ if (opt && opt->dst0opt && !opt->srcrt) { if (opt_space != opt) { memcpy(opt_space, opt, sizeof(*opt_space)); opt = opt_space; } opt->opt_nflen -= ipv6_optlen(opt->dst0opt); opt->dst0opt = NULL; } return opt; } EXPORT_SYMBOL_GPL(ipv6_fixup_options); /** * fl6_update_dst - update flowi destination address with info given * by srcrt option, if any. * * @fl6: flowi6 for which daddr is to be updated * @opt: struct ipv6_txoptions in which to look for srcrt opt * @orig: copy of original daddr address if modified * * Returns NULL if no txoptions or no srcrt, otherwise returns orig * and initial value of fl6->daddr set in orig */ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig) { if (!opt || !opt->srcrt) return NULL; *orig = fl6->daddr; switch (opt->srcrt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; break; case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; fl6->daddr = srh->segments[srh->segments_left]; break; } default: return NULL; } return orig; } EXPORT_SYMBOL_GPL(fl6_update_dst); |
5 6 6 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #ifndef _WG_QUEUEING_H #define _WG_QUEUEING_H #include "peer.h" #include <linux/types.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip_tunnels.h> struct wg_device; struct wg_peer; struct multicore_worker; struct crypt_queue; struct prev_queue; struct sk_buff; /* queueing.c APIs: */ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, unsigned int len); void wg_packet_queue_free(struct crypt_queue *queue, bool purge); struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr); /* receive.c APIs: */ void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb); void wg_packet_handshake_receive_worker(struct work_struct *work); /* NAPI poll function: */ int wg_packet_rx_poll(struct napi_struct *napi, int budget); /* Workqueue worker: */ void wg_packet_decrypt_worker(struct work_struct *work); /* send.c APIs: */ void wg_packet_send_queued_handshake_initiation(struct wg_peer *peer, bool is_retry); void wg_packet_send_handshake_response(struct wg_peer *peer); void wg_packet_send_handshake_cookie(struct wg_device *wg, struct sk_buff *initiating_skb, __le32 sender_index); void wg_packet_send_keepalive(struct wg_peer *peer); void wg_packet_purge_staged_packets(struct wg_peer *peer); void wg_packet_send_staged_packets(struct wg_peer *peer); /* Workqueue workers: */ void wg_packet_handshake_send_worker(struct work_struct *work); void wg_packet_tx_worker(struct work_struct *work); void wg_packet_encrypt_worker(struct work_struct *work); enum packet_state { PACKET_STATE_UNCRYPTED, PACKET_STATE_CRYPTED, PACKET_STATE_DEAD }; struct packet_cb { u64 nonce; struct noise_keypair *keypair; atomic_t state; u32 mtu; u8 ds; }; #define PACKET_CB(skb) ((struct packet_cb *)((skb)->cb)) #define PACKET_PEER(skb) (PACKET_CB(skb)->keypair->entry.peer) static inline bool wg_check_packet_protocol(struct sk_buff *skb) { __be16 real_protocol = ip_tunnel_parse_protocol(skb); return real_protocol && skb->protocol == real_protocol; } static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating) { u8 l4_hash = skb->l4_hash; u8 sw_hash = skb->sw_hash; u32 hash = skb->hash; skb_scrub_packet(skb, true); memset(&skb->headers_start, 0, offsetof(struct sk_buff, headers_end) - offsetof(struct sk_buff, headers_start)); if (encapsulating) { skb->l4_hash = l4_hash; skb->sw_hash = sw_hash; skb->hash = hash; } skb->queue_mapping = 0; skb->nohdr = 0; skb->peeked = 0; skb->mac_len = 0; skb->dev = NULL; #ifdef CONFIG_NET_SCHED skb->tc_index = 0; #endif skb_reset_redirect(skb); skb->hdr_len = skb_headroom(skb); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_probe_transport_header(skb); skb_reset_inner_headers(skb); } static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id) { unsigned int cpu = *stored_cpu, cpu_index, i; if (unlikely(cpu == nr_cpumask_bits || !cpumask_test_cpu(cpu, cpu_online_mask))) { cpu_index = id % cpumask_weight(cpu_online_mask); cpu = cpumask_first(cpu_online_mask); for (i = 0; i < cpu_index; ++i) cpu = cpumask_next(cpu, cpu_online_mask); *stored_cpu = cpu; } return cpu; } /* This function is racy, in the sense that it's called while last_cpu is * unlocked, so it could return the same CPU twice. Adding locking or using * atomic sequence numbers is slower though, and the consequences of racing are * harmless, so live with it. */ static inline int wg_cpumask_next_online(int *last_cpu) { int cpu = cpumask_next(READ_ONCE(*last_cpu), cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); WRITE_ONCE(*last_cpu, cpu); return cpu; } void wg_prev_queue_init(struct prev_queue *queue); /* Multi producer */ bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb); /* Single consumer */ struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue); /* Single consumer */ static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue) { if (queue->peeked) return queue->peeked; queue->peeked = wg_prev_queue_dequeue(queue); return queue->peeked; } /* Single consumer */ static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue) { queue->peeked = NULL; } static inline int wg_queue_enqueue_per_device_and_peer( struct crypt_queue *device_queue, struct prev_queue *peer_queue, struct sk_buff *skb, struct workqueue_struct *wq) { int cpu; atomic_set_release(&PACKET_CB(skb)->state, PACKET_STATE_UNCRYPTED); /* We first queue this up for the peer ingestion, but the consumer * will wait for the state to change to CRYPTED or DEAD before. */ if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb))) return -ENOSPC; /* Then we queue it up in the device queue, which consumes the * packet as soon as it can. */ cpu = wg_cpumask_next_online(&device_queue->last_cpu); if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb))) return -EPIPE; queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work); return 0; } static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. */ struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), peer->device->packet_crypt_wq, &peer->transmit_packet_work); wg_peer_put(peer); } static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. */ struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); napi_schedule(&peer->napi); wg_peer_put(peer); } #ifdef DEBUG bool wg_packet_counter_selftest(void); #endif #endif /* _WG_QUEUEING_H */ |
50 50 3 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_est.c: simple rate estimator for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * Affected data: est_list and est_lock. * estimation_timer() runs with timer per netns. * get_stats()) do the per cpu summing. */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> #include <net/ip_vs.h> /* This code is to estimate rate in a shorter interval (such as 8 seconds) for virtual services and real servers. For measure rate in a long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. Currently, the measurement is activated by slow timer handler. Hope this measurement will not introduce too much load. We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W where W = 2^(-2) NOTES. * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. * Netlink users can see 64-bit values but sockopt users are restricted to 32-bit values for conns, packets, bps, cps and pps. * A lot of code is taken from net/core/gen_estimator.c */ /* * Make a summary from each cpu */ static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, struct ip_vs_cpu_stats __percpu *stats) { int i; bool add = false; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); unsigned int start; u64 conns, inpkts, outpkts, inbytes, outbytes; if (add) { do { start = u64_stats_fetch_begin(&s->syncp); conns = s->cnt.conns; inpkts = s->cnt.inpkts; outpkts = s->cnt.outpkts; inbytes = s->cnt.inbytes; outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); sum->conns += conns; sum->inpkts += inpkts; sum->outpkts += outpkts; sum->inbytes += inbytes; sum->outbytes += outbytes; } else { add = true; do { start = u64_stats_fetch_begin(&s->syncp); sum->conns = s->cnt.conns; sum->inpkts = s->cnt.inpkts; sum->outpkts = s->cnt.outpkts; sum->inbytes = s->cnt.inbytes; sum->outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); } } } static void estimation_timer(struct timer_list *t) { struct ip_vs_estimator *e; struct ip_vs_stats *s; u64 rate; struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); ip_vs_read_cpu_stats(&s->kstats, s->cpustats); /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; e->last_conns = s->kstats.conns; e->cps += ((s64)rate - (s64)e->cps) >> 2; rate = (s->kstats.inpkts - e->last_inpkts) << 9; e->last_inpkts = s->kstats.inpkts; e->inpps += ((s64)rate - (s64)e->inpps) >> 2; rate = (s->kstats.outpkts - e->last_outpkts) << 9; e->last_outpkts = s->kstats.outpkts; e->outpps += ((s64)rate - (s64)e->outpps) >> 2; /* scaled by 2^5, but divided 2 seconds */ rate = (s->kstats.inbytes - e->last_inbytes) << 4; e->last_inbytes = s->kstats.inbytes; e->inbps += ((s64)rate - (s64)e->inbps) >> 2; rate = (s->kstats.outbytes - e->last_outbytes) << 4; e->last_outbytes = s->kstats.outbytes; e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; INIT_LIST_HEAD(&est->list); spin_lock_bh(&ipvs->est_lock); list_add(&est->list, &ipvs->est_list); spin_unlock_bh(&ipvs->est_lock); } void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; spin_lock_bh(&ipvs->est_lock); list_del(&est->list); spin_unlock_bh(&ipvs->est_lock); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ est->last_inbytes = k->inbytes; est->last_outbytes = k->outbytes; est->last_conns = k->conns; est->last_inpkts = k->inpkts; est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; est->inbps = 0; est->outbps = 0; } /* Get decoded rates */ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; dst->cps = (e->cps + 0x1FF) >> 10; dst->inpps = (e->inpps + 0x1FF) >> 10; dst->outpps = (e->outpps + 0x1FF) >> 10; dst->inbps = (e->inbps + 0xF) >> 5; dst->outbps = (e->outbps + 0xF) >> 5; } int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->est_list); spin_lock_init(&ipvs->est_lock); timer_setup(&ipvs->est_timer, estimation_timer, 0); mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { del_timer_sync(&ipvs->est_timer); } |
889 889 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 | // SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding * LAPB 002 Jonathan Naylor New timer architecture. * 2000-10-29 Henner Eisen lapb_data_indication() return status. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/stat.h> #include <linux/init.h> #include <net/lapb.h> static LIST_HEAD(lapb_list); static DEFINE_RWLOCK(lapb_list_lock); /* * Free an allocated lapb control block. */ static void lapb_free_cb(struct lapb_cb *lapb) { kfree(lapb); } static __inline__ void lapb_hold(struct lapb_cb *lapb) { refcount_inc(&lapb->refcnt); } static __inline__ void lapb_put(struct lapb_cb *lapb) { if (refcount_dec_and_test(&lapb->refcnt)) lapb_free_cb(lapb); } /* * Socket removal during an interrupt is now safe. */ static void __lapb_remove_cb(struct lapb_cb *lapb) { if (lapb->node.next) { list_del(&lapb->node); lapb_put(lapb); } } /* * Add a socket to the bound sockets list. */ static void __lapb_insert_cb(struct lapb_cb *lapb) { list_add(&lapb->node, &lapb_list); lapb_hold(lapb); } static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) { struct lapb_cb *lapb, *use = NULL; list_for_each_entry(lapb, &lapb_list, node) { if (lapb->dev == dev) { use = lapb; break; } } if (use) lapb_hold(use); return use; } static struct lapb_cb *lapb_devtostruct(struct net_device *dev) { struct lapb_cb *rc; read_lock_bh(&lapb_list_lock); rc = __lapb_devtostruct(dev); read_unlock_bh(&lapb_list_lock); return rc; } /* * Create an empty LAPB control block. */ static struct lapb_cb *lapb_create_cb(void) { struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); if (!lapb) goto out; skb_queue_head_init(&lapb->write_queue); skb_queue_head_init(&lapb->ack_queue); timer_setup(&lapb->t1timer, NULL, 0); timer_setup(&lapb->t2timer, NULL, 0); lapb->t1timer_running = false; lapb->t2timer_running = false; lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; lapb->n2 = LAPB_DEFAULT_N2; lapb->mode = LAPB_DEFAULT_MODE; lapb->window = LAPB_DEFAULT_WINDOW; lapb->state = LAPB_STATE_0; spin_lock_init(&lapb->lock); refcount_set(&lapb->refcnt, 1); out: return lapb; } int lapb_register(struct net_device *dev, const struct lapb_register_struct *callbacks) { struct lapb_cb *lapb; int rc = LAPB_BADTOKEN; write_lock_bh(&lapb_list_lock); lapb = __lapb_devtostruct(dev); if (lapb) { lapb_put(lapb); goto out; } lapb = lapb_create_cb(); rc = LAPB_NOMEM; if (!lapb) goto out; lapb->dev = dev; lapb->callbacks = callbacks; __lapb_insert_cb(lapb); lapb_start_t1timer(lapb); rc = LAPB_OK; out: write_unlock_bh(&lapb_list_lock); return rc; } EXPORT_SYMBOL(lapb_register); int lapb_unregister(struct net_device *dev) { struct lapb_cb *lapb; int rc = LAPB_BADTOKEN; write_lock_bh(&lapb_list_lock); lapb = __lapb_devtostruct(dev); if (!lapb) goto out; lapb_put(lapb); /* Wait for other refs to "lapb" to drop */ while (refcount_read(&lapb->refcnt) > 2) usleep_range(1, 10); spin_lock_bh(&lapb->lock); lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); lapb_clear_queues(lapb); spin_unlock_bh(&lapb->lock); /* Wait for running timers to stop */ del_timer_sync(&lapb->t1timer); del_timer_sync(&lapb->t2timer); __lapb_remove_cb(lapb); lapb_put(lapb); rc = LAPB_OK; out: write_unlock_bh(&lapb_list_lock); return rc; } EXPORT_SYMBOL(lapb_unregister); int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) { int rc = LAPB_BADTOKEN; struct lapb_cb *lapb = lapb_devtostruct(dev); if (!lapb) goto out; spin_lock_bh(&lapb->lock); parms->t1 = lapb->t1 / HZ; parms->t2 = lapb->t2 / HZ; parms->n2 = lapb->n2; parms->n2count = lapb->n2count; parms->state = lapb->state; parms->window = lapb->window; parms->mode = lapb->mode; if (!timer_pending(&lapb->t1timer)) parms->t1timer = 0; else parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ; if (!timer_pending(&lapb->t2timer)) parms->t2timer = 0; else parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ; spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; out: return rc; } EXPORT_SYMBOL(lapb_getparms); int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) { int rc = LAPB_BADTOKEN; struct lapb_cb *lapb = lapb_devtostruct(dev); if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_INVALUE; if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1) goto out_put; if (lapb->state == LAPB_STATE_0) { if (parms->mode & LAPB_EXTENDED) { if (parms->window < 1 || parms->window > 127) goto out_put; } else { if (parms->window < 1 || parms->window > 7) goto out_put; } lapb->mode = parms->mode; lapb->window = parms->window; } lapb->t1 = parms->t1 * HZ; lapb->t2 = parms->t2 * HZ; lapb->n2 = parms->n2; rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_setparms); int lapb_connect_request(struct net_device *dev) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_OK; if (lapb->state == LAPB_STATE_1) goto out_put; rc = LAPB_CONNECTED; if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4) goto out_put; lapb_establish_data_link(lapb); lapb_dbg(0, "(%p) S0 -> S1\n", lapb->dev); lapb->state = LAPB_STATE_1; rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_connect_request); static int __lapb_disconnect_request(struct lapb_cb *lapb) { switch (lapb->state) { case LAPB_STATE_0: return LAPB_NOTCONNECTED; case LAPB_STATE_1: lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb->state = LAPB_STATE_0; lapb_start_t1timer(lapb); return LAPB_NOTCONNECTED; case LAPB_STATE_2: return LAPB_OK; } lapb_clear_queues(lapb); lapb->n2count = 0; lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb_start_t1timer(lapb); lapb_stop_t2timer(lapb); lapb->state = LAPB_STATE_2; lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev); return LAPB_OK; } int lapb_disconnect_request(struct net_device *dev) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = __lapb_disconnect_request(lapb); spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_disconnect_request); int lapb_data_request(struct net_device *dev, struct sk_buff *skb) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_NOTCONNECTED; if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4) goto out_put; skb_queue_tail(&lapb->write_queue, skb); lapb_kick(lapb); rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_data_request); int lapb_data_received(struct net_device *dev, struct sk_buff *skb) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (lapb) { spin_lock_bh(&lapb->lock); lapb_data_input(lapb, skb); spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; } return rc; } EXPORT_SYMBOL(lapb_data_received); void lapb_connect_confirmation(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->connect_confirmation) lapb->callbacks->connect_confirmation(lapb->dev, reason); } void lapb_connect_indication(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->connect_indication) lapb->callbacks->connect_indication(lapb->dev, reason); } void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->disconnect_confirmation) lapb->callbacks->disconnect_confirmation(lapb->dev, reason); } void lapb_disconnect_indication(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->disconnect_indication) lapb->callbacks->disconnect_indication(lapb->dev, reason); } int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb) { if (lapb->callbacks->data_indication) return lapb->callbacks->data_indication(lapb->dev, skb); kfree_skb(skb); return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */ } int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) { int used = 0; if (lapb->callbacks->data_transmit) { lapb->callbacks->data_transmit(lapb->dev, skb); used = 1; } return used; } /* Handle device status changes. */ static int lapb_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct lapb_cb *lapb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type != ARPHRD_X25) return NOTIFY_DONE; lapb = lapb_devtostruct(dev); if (!lapb) return NOTIFY_DONE; spin_lock_bh(&lapb->lock); switch (event) { case NETDEV_UP: lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name); if (netif_carrier_ok(dev)) { lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev, dev->name); if (lapb->mode & LAPB_DCE) { lapb_start_t1timer(lapb); } else { if (lapb->state == LAPB_STATE_0) { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } } } break; case NETDEV_GOING_DOWN: if (netif_carrier_ok(dev)) __lapb_disconnect_request(lapb); break; case NETDEV_DOWN: lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name); lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb->n2count = 0; lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); break; case NETDEV_CHANGE: if (netif_carrier_ok(dev)) { lapb_dbg(0, "(%p): Carrier detected: %s\n", dev, dev->name); if (lapb->mode & LAPB_DCE) { lapb_start_t1timer(lapb); } else { if (lapb->state == LAPB_STATE_0) { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } } } else { lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name); lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb->n2count = 0; lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); } break; } spin_unlock_bh(&lapb->lock); lapb_put(lapb); return NOTIFY_DONE; } static struct notifier_block lapb_dev_notifier = { .notifier_call = lapb_device_event, }; static int __init lapb_init(void) { return register_netdevice_notifier(&lapb_dev_notifier); } static void __exit lapb_exit(void) { WARN_ON(!list_empty(&lapb_list)); unregister_netdevice_notifier(&lapb_dev_notifier); } MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol"); MODULE_LICENSE("GPL"); module_init(lapb_init); module_exit(lapb_exit); |
105 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 | // SPDX-License-Identifier: GPL-2.0 /* RTT/RTO calculation. * * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) * * https://tools.ietf.org/html/rfc6298 * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf */ #include <linux/net.h> #include "ar-internal.h" #define RXRPC_RTO_MAX ((unsigned)(120 * HZ)) #define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) { return 200; } static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) { return usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); } static u32 rxrpc_bound_rto(u32 rto) { return min(rto, RXRPC_RTO_MAX); } /* * Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ u32 srtt = peer->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (peer->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (peer->mdev_us >> 2); /* similar update on mdev */ } peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (peer->mdev_us > peer->mdev_max_us) { peer->mdev_max_us = peer->mdev_us; if (peer->mdev_max_us > peer->rttvar_us) peer->rttvar_us = peer->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); peer->mdev_max_us = peer->rttvar_us; } peer->srtt_us = max(1U, srtt); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void rxrpc_set_rto(struct rxrpc_peer *peer) { u32 rto; /* 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ rto = __rxrpc_set_rto(peer); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ peer->rto_j = rxrpc_bound_rto(rto); } static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) { if (rtt_us < 0) return; //rxrpc_update_rtt_min(peer, rtt_us); rxrpc_rtt_estimator(peer, rtt_us); rxrpc_set_rto(peer); /* RFC6298: only reset backoff on valid RTT measurement. */ peer->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has * exclusive access to the peer RTT data. */ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; spin_lock(&peer->rtt_input_lock); rxrpc_ack_update_rtt(peer, rtt_us); if (peer->rtt_count < 3) peer->rtt_count++; spin_unlock(&peer->rtt_input_lock); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, peer->srtt_us >> 3, peer->rto_j); } /* * Get the retransmission timeout to set in jiffies, backing it off each time * we retransmit. */ unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) { u64 timo_j; u8 backoff = READ_ONCE(peer->backoff); timo_j = peer->rto_j; timo_j <<= backoff; if (retrans && timo_j * 2 <= RXRPC_RTO_MAX) WRITE_ONCE(peer->backoff, backoff + 1); if (timo_j < 1) timo_j = 1; return timo_j; } void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) { peer->rto_j = RXRPC_TIMEOUT_INIT; peer->mdev_us = jiffies_to_usecs(RXRPC_TIMEOUT_INIT); peer->backoff = 0; //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); } |
165 164 166 166 7 7 162 161 162 162 163 163 1 162 162 1 162 162 162 15 15 15 15 15 15 15 15 15 15 15 15 15 14 14 14 14 14 14 157 156 1 15 15 15 15 15 15 7 7 7 7 7 7 169 16 168 1 169 169 167 166 170 169 165 59 150 151 170 59 170 155 156 151 15 11 15 11 15 15 15 16 16 15 14 14 14 14 14 16 16 16 16 16 16 16 16 16 16 15 15 15 15 15 15 15 15 2 10 10 15 15 15 1 14 15 15 189 187 189 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2002 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions work with the state functions in sctp_sm_statefuns.c * to implement the state operations. These functions implement the * steps which require modifying existing data structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * C. Robin <chris@hundredacre.ac.uk> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Kevin Gao <kevin.gao@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/skbuff.h> #include <linux/random.h> /* for get_random_bytes */ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len); static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp); static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data); /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; if (chunk->shkey) { struct sctp_shared_key *shkey = chunk->shkey; struct sctp_association *asoc = chunk->asoc; /* refcnt == 2 and !list_empty mean after this release, it's * not being used anywhere, and it's time to notify userland * that this shkey can be freed if it's been deactivated. */ if (shkey->deactivated && !list_empty(&shkey->key_list) && refcount_read(&shkey->refcnt) == 2) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_auth_shkey_release(chunk->shkey); } } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) { struct sctp_association *asoc = chunk->asoc; struct sk_buff *skb = chunk->skb; /* TODO: properly account for control chunks. * To do it right we'll need: * 1) endpoint if association isn't known. * 2) proper memory accounting. * * For now don't do anything for now. */ if (chunk->auth) { chunk->shkey = asoc->shkey; sctp_auth_shkey_hold(chunk->shkey); } skb->sk = asoc ? asoc->base.sk : NULL; skb_shinfo(skb)->destructor_arg = chunk; skb->destructor = sctp_control_release_owner; } /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) { struct sk_buff *skb = chunk->skb; return SCTP_INPUT_CB(skb)->af->skb_iif(skb); } /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of * Explicit Congestion Notification. */ static const struct sctp_paramhdr ecap_param = { SCTP_PARAM_ECN_CAPABLE, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; static const struct sctp_paramhdr prsctp_param = { SCTP_PARAM_FWD_TSN_SUPPORT, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; /* A helper to initialize an op error inside a provided chunk, as most * cause codes will be embedded inside an abort chunk. */ int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, size_t paylen) { struct sctp_errhdr err; __u16 len; /* Cause code constants are now defined in network order. */ err.cause = cause_code; len = sizeof(err) + paylen; err.length = htons(len); if (skb_tailroom(chunk->skb) < len) return -ENOSPC; chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err); return 0; } /* 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two * endpoints. The format of the INIT chunk is shown below: * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initiate Tag | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Advertised Receiver Window Credit (a_rwnd) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of Outbound Streams | Number of Inbound Streams | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initial TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Optional/Variable-Length Parameters / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * The INIT chunk contains the following parameters. Unless otherwise * noted, each parameter MUST only be included once in the INIT chunk. * * Fixed Parameters Status * ---------------------------------------------- * Initiate Tag Mandatory * Advertised Receiver Window Credit Mandatory * Number of Outbound Streams Mandatory * Number of Inbound Streams Mandatory * Initial TSN Mandatory * * Variable Parameters Status Type Value * ------------------------------------------------------------- * IPv4 Address (Note 1) Optional 5 * IPv6 Address (Note 1) Optional 6 * Cookie Preservative Optional 9 * Reserved for ECN Capable (Note 2) Optional 32768 (0x8000) * Host Name Address (Note 3) Optional 11 * Supported Address Types (Note 4) Optional 12 */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, gfp_t gfp, int vparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_supported_addrs_param sat; struct sctp_endpoint *ep = asoc->ep; struct sctp_chunk *retval = NULL; int num_types, addrs_len = 0; struct sctp_inithdr init; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; __be16 types[2]; int num_ext = 0; /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 1: The INIT chunks can contain multiple addresses that * can be IPv4 and/or IPv6 in any combination. */ /* Convert the provided bind address list to raw format. */ addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp); init.init_tag = htonl(asoc->c.my_vtag); init.a_rwnd = htonl(asoc->rwnd); init.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); init.num_inbound_streams = htons(asoc->c.sinit_max_instreams); init.initial_tsn = htonl(asoc->c.initial_tsn); /* How many address types are needed? */ sp = sctp_sk(asoc->base.sk); num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); if (asoc->ep->ecn_enable) chunksize += sizeof(ecap_param); if (asoc->ep->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: * An implementation supporting this extension [ADDIP] MUST list * the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and * INIT-ACK parameters. */ if (asoc->ep->asconf_enable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->ep->reconf_enable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->ep->intl_enable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } chunksize += vparam_len; /* Account for AUTH related parameters */ if (ep->auth_enable) { /* Add random parameter length*/ chunksize += sizeof(asoc->c.auth_random); /* Add HMACS parameter length if any were defined */ auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } /* If we have any extensions to report, account for that */ if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 3: An INIT chunk MUST NOT contain more than one Host * Name address parameter. Moreover, the sender of the INIT * MUST NOT combine any other address types with the Host Name * address in the INIT. The receiver of INIT MUST ignore any * other address types if the Host Name address parameter is * present in the received INIT chunk. * * PLEASE DO NOT FIXME [This version does not support Host Name.] */ retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp); if (!retval) goto nodata; retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(init), &init); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 4: This parameter, when present, specifies all the * address types the sending endpoint can support. The absence * of this parameter indicates that the sending endpoint can * support any address type. */ sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES; sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types)); sctp_addto_chunk(retval, sizeof(sat), &sat); sctp_addto_chunk(retval, num_types * sizeof(__u16), &types); if (asoc->ep->ecn_enable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); /* Add the supported extensions parameter. Be nice and add this * fist before addiding the parameters for the extensions themselves */ if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->ep->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } /* Add SCTP-AUTH chunks to the parameter list */ if (ep->auth_enable) { sctp_addto_chunk(retval, sizeof(asoc->c.auth_random), asoc->c.auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } nodata: kfree(addrs.v); return retval; } struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, gfp_t gfp, int unkparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_random = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_chunk *retval = NULL; struct sctp_cookie_param *cookie; struct sctp_inithdr initack; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; int num_ext = 0; int cookie_len; int addrs_len; /* Note: there may be no addresses to embed. */ addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp); initack.init_tag = htonl(asoc->c.my_vtag); initack.a_rwnd = htonl(asoc->rwnd); initack.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); initack.num_inbound_streams = htons(asoc->c.sinit_max_instreams); initack.initial_tsn = htonl(asoc->c.initial_tsn); /* FIXME: We really ought to build the cookie right * into the packet instead of allocating more fresh memory. */ cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len, addrs.v, addrs_len); if (!cookie) goto nomem_cookie; /* Calculate the total size of allocation, include the reserved * space for reporting unknown parameters if it is specified. */ sp = sctp_sk(asoc->base.sk); chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; /* Tell peer that we'll do ECN only if peer advertised such cap. */ if (asoc->peer.ecn_capable) chunksize += sizeof(ecap_param); if (asoc->peer.prsctp_capable) chunksize += sizeof(prsctp_param); if (asoc->peer.asconf_capable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->peer.reconf_capable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->peer.intl_capable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } if (asoc->peer.auth_capable) { auth_random = (struct sctp_paramhdr *)asoc->c.auth_random; chunksize += ntohs(auth_random->length); auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); if (!retval) goto nomem_chunk; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * * [INIT ACK back to where the INIT came from.] */ if (chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(initack), &initack); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); sctp_addto_chunk(retval, cookie_len, cookie); if (asoc->peer.ecn_capable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->peer.prsctp_capable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } if (asoc->peer.auth_capable) { sctp_addto_chunk(retval, ntohs(auth_random->length), auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } /* We need to remove the const qualifier at this point. */ retval->asoc = (struct sctp_association *) asoc; nomem_chunk: kfree(cookie); nomem_cookie: kfree(addrs.v); return retval; } /* 3.3.11 Cookie Echo (COOKIE ECHO) (10): * * This chunk is used only during the initialization of an association. * It is sent by the initiator of an association to its peer to complete * the initialization process. This chunk MUST precede any DATA chunk * sent within the association, but MAY be bundled with one or more DATA * chunks in the same packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 10 |Chunk Flags | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / Cookie / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bit * * Set to zero on transmit and ignored on receipt. * * Length: 16 bits (unsigned integer) * * Set to the size of the chunk in bytes, including the 4 bytes of * the chunk header and the size of the Cookie. * * Cookie: variable size * * This field must contain the exact cookie received in the * State Cookie parameter from the previous INIT ACK. * * An implementation SHOULD make the cookie as small as possible * to insure interoperability. */ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; int cookie_len; void *cookie; cookie = asoc->peer.cookie; cookie_len = asoc->peer.cookie_len; /* Build a cookie echo chunk. */ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.cookie_hdr = sctp_addto_chunk(retval, cookie_len, cookie); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ECHO back to where the INIT ACK came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11): * * This chunk is used only during the initialization of an * association. It is used to acknowledge the receipt of a COOKIE * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent * within the association, but MAY be bundled with one or more DATA * chunks or SACK chunk in the same SCTP packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 11 |Chunk Flags | Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Set to zero on transmit and ignored on receipt. */ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ACK back to where the COOKIE ECHO came from.] */ if (retval && chunk && chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); return retval; } /* * Appendix A: Explicit Congestion Notification: * CWR: * * RFC 2481 details a specific bit for a sender to send in the header of * its next outbound TCP segment to indicate to its peer that it has * reduced its congestion window. This is termed the CWR bit. For * SCTP the same indication is made by including the CWR chunk. * This chunk contains one data element, i.e. the TSN number that * was sent in the ECNE chunk. This element represents the lowest * TSN number in the datagram that was originally marked with the * CE bit. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Lowest TSN Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note: The CWR is considered a Control chunk. */ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; struct sctp_cwrhdr cwr; cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, sizeof(cwr), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecn_cwr_hdr = sctp_addto_chunk(retval, sizeof(cwr), &cwr); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report a reduced congestion window back to where the ECNE * came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Make an ECNE chunk. This is a congestion experienced report. */ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn) { struct sctp_chunk *retval; struct sctp_ecnehdr ecne; ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, sizeof(ecne), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = sctp_addto_chunk(retval, sizeof(ecne), &ecne); nodata: return retval; } /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; /* We assign the TSN as LATE as possible, not here when * creating the chunk. */ memset(&dp, 0, sizeof(dp)); dp.ppid = sinfo->sinfo_ppid; dp.stream = htons(sinfo->sinfo_stream); /* Set the flags for an unordered send. */ if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } /* Create a selective ackowledgement (SACK) for the given * association. This reports on which TSN's we've seen to date, * including duplicates and gaps. */ struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc) { struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; __u16 num_gabs, num_dup_tsns; struct sctp_transport *trans; struct sctp_chunk *retval; struct sctp_sackhdr sack; __u32 ctsn; int len; memset(gabs, 0, sizeof(gabs)); ctsn = sctp_tsnmap_get_ctsn(map); pr_debug("%s: sackCTSNAck sent:0x%x\n", __func__, ctsn); /* How much room is needed in the chunk? */ num_gabs = sctp_tsnmap_num_gabs(map, gabs); num_dup_tsns = sctp_tsnmap_num_dups(map); /* Initialize the SACK header. */ sack.cum_tsn_ack = htonl(ctsn); sack.a_rwnd = htonl(asoc->a_rwnd); sack.num_gap_ack_blocks = htons(num_gabs); sack.num_dup_tsns = htons(num_dup_tsns); len = sizeof(sack) + sizeof(struct sctp_gap_ack_block) * num_gabs + sizeof(__u32) * num_dup_tsns; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk to * which it is replying. This rule should also be followed if * the endpoint is bundling DATA chunks together with the * reply chunk. * * However, when acknowledging multiple DATA chunks received * in packets from different source addresses in a single * SACK, the SACK chunk may be transmitted to one of the * destination transport addresses from which the DATA or * control chunks being acknowledged were received. * * [BUG: We do not implement the following paragraph. * Perhaps we should remember the last transport we used for a * SACK and avoid that (if possible) if we have seen any * duplicates. --piggy] * * When a receiver of a duplicate DATA chunk sends a SACK to a * multi- homed endpoint it MAY be beneficial to vary the * destination address and not use the source address of the * DATA chunk. The reason being that receiving a duplicate * from a multi-homed endpoint might indicate that the return * path (as specified in the source address of the DATA chunk) * for the SACK is broken. * * [Send to the address from which we last received a DATA chunk.] */ retval->transport = asoc->peer.last_data_from; retval->subh.sack_hdr = sctp_addto_chunk(retval, sizeof(sack), &sack); /* Add the gap ack block information. */ if (num_gabs) sctp_addto_chunk(retval, sizeof(__u32) * num_gabs, gabs); /* Add the duplicate TSN information. */ if (num_dup_tsns) { asoc->stats.idupchunks += num_dup_tsns; sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, sctp_tsnmap_get_dups(map)); } /* Once we have a sack generated, check to see what our sack * generation is, if its 0, reset the transports to 0, and reset * the association generation to 1 * * The idea is that zero is never used as a valid generation for the * association so no transport will match after a wrap event like this, * Until the next sack */ if (++asoc->peer.sack_generation == 0) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) trans->sack_generation = 0; asoc->peer.sack_generation = 1; } nodata: return retval; } /* Make a SHUTDOWN chunk. */ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_shutdownhdr shut; struct sctp_chunk *retval; __u32 ctsn; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, sizeof(shut), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.shutdown_hdr = sctp_addto_chunk(retval, sizeof(shut), &shut); if (chunk) retval->transport = chunk->transport; nodata: return retval; } struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ACK back to where the SHUTDOWN came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association (vtag will be * reflected) */ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK * came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Create an ABORT. Note that we set the T bit if we have no * association, except when responding to an INIT (sctpimpguide 2.41). */ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association and 'chunk' is not * an INIT (vtag will be reflected). */ if (!asoc) { if (chunk && chunk->chunk_hdr && chunk->chunk_hdr->type == SCTP_CID_INIT) flags = 0; else flags = SCTP_CHUNK_FLAG_T; } retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Helper to create ABORT with a NO_USER_DATA error. */ struct sctp_chunk *sctp_make_abort_no_data( const struct sctp_association *asoc, const struct sctp_chunk *chunk, __u32 tsn) { struct sctp_chunk *retval; __be32 payload; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(tsn)); if (!retval) goto no_mem; /* Put the tsn back into network byte order. */ payload = htonl(tsn); sctp_init_cause(retval, SCTP_ERROR_NO_DATA, sizeof(payload)); sctp_addto_chunk(retval, sizeof(payload), (const void *)&payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (chunk) retval->transport = chunk->transport; no_mem: return retval; } /* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. */ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, struct msghdr *msg, size_t paylen) { struct sctp_chunk *retval; void *payload = NULL; int err; retval = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr) + paylen); if (!retval) goto err_chunk; if (paylen) { /* Put the msg_iov together into payload. */ payload = kmalloc(paylen, GFP_KERNEL); if (!payload) goto err_payload; err = memcpy_from_msg(payload, msg, paylen); if (err < 0) goto err_copy; } sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen); sctp_addto_chunk(retval, paylen, payload); if (paylen) kfree(payload); return retval; err_copy: kfree(payload); err_payload: sctp_chunk_free(retval); retval = NULL; err_chunk: return retval; } /* Append bytes to the end of a parameter. Will panic if chunk is not big * enough. */ static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); void *target; target = skb_put(chunk->skb, len); if (data) memcpy(target, data, len); else memset(target, 0, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */ struct sctp_chunk *sctp_make_abort_violation( const struct sctp_association *asoc, const struct sctp_chunk *chunk, const __u8 *payload, const size_t paylen) { struct sctp_chunk *retval; struct sctp_paramhdr phdr; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + paylen + sizeof(phdr)); if (!retval) goto end; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, paylen + sizeof(phdr)); phdr.type = htons(chunk->chunk_hdr->type); phdr.length = chunk->chunk_hdr->length; sctp_addto_chunk(retval, paylen, payload); sctp_addto_param(retval, sizeof(phdr), &phdr); end: return retval; } struct sctp_chunk *sctp_make_violation_paramlen( const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_paramhdr *param) { static const char error[] = "The following parameter had invalid length:"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) + sizeof(*param); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error) + sizeof(*param)); sctp_addto_chunk(retval, sizeof(error), error); sctp_addto_param(retval, sizeof(*param), param); nodata: return retval; } struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { static const char error[] = "Association exceeded its max_retrans count"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error)); sctp_addto_chunk(retval, sizeof(error), error); nodata: return retval; } struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_new_encap_port_hdr nep; struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(nep)); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; nep.new_port = chunk->transport->encap_port; sctp_addto_chunk(retval, sizeof(nep), &nep); nodata: return retval; } /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport, __u32 probe_size) { struct sctp_sender_hb_info hbinfo = {}; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo), GFP_ATOMIC); if (!retval) goto nodata; hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; hbinfo.param_hdr.length = htons(sizeof(hbinfo)); hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; hbinfo.probe_size = probe_size; /* Cast away the 'const', as this is just telling the chunk * what transport it belongs to. */ retval->transport = (struct sctp_transport *) transport; retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), &hbinfo); retval->pmtu_probe = !!probe_size; nodata: return retval; } struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [HBACK back to where the HEARTBEAT came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* RFC4820 3. Padding Chunk (PAD) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x84 | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ Padding Data / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC); if (!retval) return NULL; skb_put_zero(retval->skb, len); retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /* Create an Operation Error chunk with the specified space reserved. * This routine can be used for containing multiple causes in the chunk. */ static struct sctp_chunk *sctp_make_op_error_space( const struct sctp_association *asoc, const struct sctp_chunk *chunk, size_t size) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, sizeof(struct sctp_errhdr) + size, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Create an Operation Error chunk of a fixed size, specifically, * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads. * This is a helper function to allocate an error chunk for those * invalid parameter codes in which we may not want to report all the * errors, if the incoming chunk is large. If it can't fit in a single * packet, we ignore it. */ static inline struct sctp_chunk *sctp_make_op_error_limited( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { size_t size = SCTP_DEFAULT_MAXSEGMENT; struct sctp_sock *sp = NULL; if (asoc) { size = min_t(size_t, size, asoc->pathmtu); sp = sctp_sk(asoc->base.sk); } size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr)); return sctp_make_op_error_space(asoc, chunk, size); } /* Create an Operation Error chunk. */ struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, size_t paylen, size_t reserve_tail) { struct sctp_chunk *retval; retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail); if (!retval) goto nodata; sctp_init_cause(retval, cause_code, paylen + reserve_tail); sctp_addto_chunk(retval, paylen, payload); if (reserve_tail) sctp_addto_param(retval, reserve_tail, NULL); nodata: return retval; } struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (unlikely(!hmac_desc)) return NULL; retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0, hmac_desc->hmac_len + sizeof(auth_hdr), GFP_ATOMIC); if (!retval) return NULL; auth_hdr.hmac_id = htons(hmac_desc->hmac_id); auth_hdr.shkey_id = htons(key_id); retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), &auth_hdr); skb_put_zero(retval->skb, hmac_desc->hmac_len); /* Adjust the chunk header to include the empty MAC */ retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + hmac_desc->hmac_len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Turn an skb into a chunk. * FIXME: Eventually move the structure directly inside the skb->cb[]. * * sctpimpguide-05.txt Section 2.8.2 * M1) Each time a new DATA chunk is transmitted * set the 'TSN.Missing.Report' count for that TSN to 0. The * 'TSN.Missing.Report' count will be used to determine missing chunks * and when to fast retransmit. * */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, const struct sctp_association *asoc, struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp); if (!retval) goto nodata; if (!sk) pr_debug("%s: chunkifying skb:%p w/o an sk\n", __func__, skb); INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; retval->singleton = 1; retval->fast_retransmit = SCTP_CAN_FRTX; /* Polish the bead hole. */ INIT_LIST_HEAD(&retval->transmitted_list); INIT_LIST_HEAD(&retval->frag_list); SCTP_DBG_OBJCNT_INC(chunk); refcount_set(&retval->refcnt, 1); nodata: return retval; } /* Set chunk->source and dest based on the IP header in chunk->skb. */ void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src, union sctp_addr *dest) { memcpy(&chunk->source, src, sizeof(union sctp_addr)); memcpy(&chunk->dest, dest, sizeof(union sctp_addr)); } /* Extract the source address from a chunk. */ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) { /* If we have a known transport, use that. */ if (chunk->transport) { return &chunk->transport->ipaddr; } else { /* Otherwise, extract it from the IP header. */ return &chunk->source; } } /* Create a new chunk, setting the type and flags headers from the * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunkhdr *chunk_hdr; struct sctp_chunk *retval; struct sk_buff *skb; struct sock *sk; int chunklen; chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen); if (chunklen > SCTP_MAX_CHUNK_LEN) goto nodata; /* No need to allocate LL here, as this is only a chunk. */ skb = alloc_skb(chunklen, gfp); if (!skb) goto nodata; /* Make room for the chunk header. */ chunk_hdr = (struct sctp_chunkhdr *)skb_put(skb, sizeof(*chunk_hdr)); chunk_hdr->type = type; chunk_hdr->flags = flags; chunk_hdr->length = htons(sizeof(*chunk_hdr)); sk = asoc ? asoc->base.sk : NULL; retval = sctp_chunkify(skb, asoc, sk, gfp); if (!retval) { kfree_skb(skb); goto nodata; } retval->chunk_hdr = chunk_hdr; retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(*chunk_hdr); /* Determine if the chunk needs to be authenticated */ if (sctp_auth_send_cid(type, asoc)) retval->auth = 1; return retval; nodata: return NULL; } static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp); } static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunk *chunk; chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp); if (chunk) sctp_control_set_owner_w(chunk); return chunk; } /* Release the memory occupied by a chunk. */ static void sctp_chunk_destroy(struct sctp_chunk *chunk) { BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); consume_skb(chunk->skb); consume_skb(chunk->auth_chunk); SCTP_DBG_OBJCNT_DEC(chunk); kmem_cache_free(sctp_chunk_cachep, chunk); } /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { /* Release our reference on the message tracker. */ if (chunk->msg) sctp_datamsg_put(chunk->msg); sctp_chunk_put(chunk); } /* Grab a reference to the chunk. */ void sctp_chunk_hold(struct sctp_chunk *ch) { refcount_inc(&ch->refcnt); } /* Release a reference to the chunk. */ void sctp_chunk_put(struct sctp_chunk *ch) { if (refcount_dec_and_test(&ch->refcnt)) sctp_chunk_destroy(ch); } /* Append bytes to the end of a chunk. Will panic if chunk is not big * enough. */ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); int padlen = SCTP_PAD4(chunklen) - chunklen; void *target; skb_put_zero(chunk->skb, padlen); target = skb_put_data(chunk->skb, data, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + padlen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Append bytes from user space to the end of a chunk. Will panic if * chunk is not big enough. * Returns a kernel err value. */ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from) { void *target; /* Make room in chunk for data. */ target = skb_put(chunk->skb, len); /* Copy data (whole iovec) into chunk */ if (!copy_from_iter_full(target, len, from)) return -EFAULT; /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(ntohs(chunk->chunk_hdr->length) + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return 0; } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; struct sctp_datamsg *msg; __u16 ssn, sid; if (chunk->has_ssn) return; /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); stream = &chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. */ msg = chunk->msg; list_for_each_entry(lchunk, &msg->chunks, frag_list) { if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { ssn = 0; } else { if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) ssn = sctp_ssn_next(stream, out, sid); else ssn = sctp_ssn_peek(stream, out, sid); } lchunk->subh.data_hdr->ssn = htons(ssn); lchunk->has_ssn = 1; } } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) { if (!chunk->has_tsn) { /* This is the last possible instant to * assign a TSN. */ chunk->subh.data_hdr->tsn = htonl(sctp_association_get_next_tsn(chunk->asoc)); chunk->has_tsn = 1; } } /* Create a CLOSED association to use with an incoming packet. */ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc; enum sctp_scope scope; struct sk_buff *skb; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); asoc = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!asoc) goto nodata; asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); nodata: return asoc; } /* Build a cookie representing asoc. * This INCLUDES the param header needed to put the cookie in the INIT ACK. */ static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len) { struct sctp_signed_cookie *cookie; struct sctp_cookie_param *retval; int headersize, bodysize; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_paramhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = sizeof(struct sctp_cookie) + ntohs(init_chunk->chunk_hdr->length) + addrs_len; /* Pad out the cookie to a multiple to make the signature * functions simpler to write. */ if (bodysize % SCTP_COOKIE_MULTIPLE) bodysize += SCTP_COOKIE_MULTIPLE - (bodysize % SCTP_COOKIE_MULTIPLE); *cookie_len = headersize + bodysize; /* Clear this memory since we are sending this data structure * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); if (!retval) goto nodata; cookie = (struct sctp_signed_cookie *) retval->body; /* Set up the parameter header. */ retval->p.type = SCTP_PARAM_STATE_COOKIE; retval->p.length = htons(*cookie_len); /* Copy the cookie part of the association itself. */ cookie->c = asoc->c; /* Save the raw address list length in the cookie. */ cookie->c.raw_addr_list_len = addrs_len; /* Remember PR-SCTP capability. */ cookie->c.prsctp_capable = asoc->peer.prsctp_capable; /* Save adaptation indication in the cookie. */ cookie->c.adaptation_ind = asoc->peer.adaptation_ind; /* Set an expiration time for the cookie. */ cookie->c.expiration = ktime_add(asoc->cookie_life, ktime_get_real()); /* Copy the peer's init packet. */ memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr, ntohs(init_chunk->chunk_hdr->length)); /* Copy the raw local address list of the association. */ memcpy((__u8 *)&cookie->c.peer_init[0] + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); if (sctp_sk(ep->base.sk)->hmac) { struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; int err; /* Sign the message. */ err = crypto_shash_setkey(tfm, ep->secret_key, sizeof(ep->secret_key)) ?: crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, cookie->signature); if (err) goto free_cookie; } return retval; free_cookie: kfree(retval); nodata: *cookie_len = 0; return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ struct sctp_association *sctp_unpack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp, int *error, struct sctp_chunk **errp) { struct sctp_association *retval = NULL; int headersize, bodysize, fixed_size; struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_chunkhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = ntohs(chunk->chunk_hdr->length) - headersize; fixed_size = headersize + sizeof(struct sctp_cookie); /* Verify that the chunk looks like it even has a cookie. * There must be enough room for our cookie and our peer's * INIT chunk. */ len = ntohs(chunk->chunk_hdr->length); if (len < fixed_size + sizeof(struct sctp_chunkhdr)) goto malformed; /* Verify that the cookie has been padded out. */ if (bodysize % SCTP_COOKIE_MULTIPLE) goto malformed; /* Process the cookie. */ cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; if (!sctp_sk(ep->base.sk)->hmac) goto no_hmac; /* Check the signature. */ { struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; int err; err = crypto_shash_setkey(tfm, ep->secret_key, sizeof(ep->secret_key)) ?: crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, digest); if (err) { *error = -SCTP_IERROR_NOMEM; goto fail; } } if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { *error = -SCTP_IERROR_BAD_SIG; goto fail; } no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the * verification tag within the SCTP common header of the received * packet. If these values do not match the packet MUST be silently * discarded, */ if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) { *error = -SCTP_IERROR_BAD_TAG; goto fail; } if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port || ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) { *error = -SCTP_IERROR_BAD_PORTS; goto fail; } /* Check to see if the cookie is stale. If there is already * an association, there is no need to check cookie's expiration * for init collision case of lost COOKIE ACK. * If skb has been timestamped, then use the stamp, otherwise * use current time. This introduces a small possibility that * a cookie may be considered expired, but this would only slow * down the new association establishment instead of every packet. */ if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) kt = skb_get_ktime(skb); else kt = ktime_get_real(); if (!asoc && ktime_before(bear_cookie->expiration, kt)) { suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); __be32 n = htonl(usecs); /* * Section 3.3.10.3 Stale Cookie Error (3) * * Cause of error * --------------- * Stale Cookie Error: Indicates the receipt of a valid State * Cookie that has expired. */ *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_STALE_COOKIE, &n, sizeof(n), 0); if (*errp) *error = -SCTP_IERROR_STALE_COOKIE; else *error = -SCTP_IERROR_NOMEM; goto fail; } /* Make a new base association. */ scope = sctp_scope(sctp_source(chunk)); retval = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!retval) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Set up our peer's port number. */ retval->peer.port = ntohs(chunk->sctp_hdr->source); /* Populate the association from the cookie. */ memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie)); if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie, GFP_ATOMIC) < 0) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Also, add the destination address. */ if (list_empty(&retval->base.bind_addr.address_list)) { sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, sizeof(chunk->dest), SCTP_ADDR_SRC, GFP_ATOMIC); } retval->next_tsn = retval->c.initial_tsn; retval->ctsn_ack_point = retval->next_tsn - 1; retval->addip_serial = retval->c.initial_tsn; retval->strreset_outseq = retval->c.initial_tsn; retval->adv_peer_ack_point = retval->ctsn_ack_point; retval->peer.prsctp_capable = retval->c.prsctp_capable; retval->peer.adaptation_ind = retval->c.adaptation_ind; /* The INIT stuff will be done by the side effects. */ return retval; fail: if (retval) sctp_association_free(retval); return NULL; malformed: /* Yikes! The packet is either corrupt or deliberately * malformed. */ *error = -SCTP_IERROR_MALFORMED; goto fail; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ struct __sctp_missing { __be32 num_missing; __be16 type; } __packed; /* * Report a missing mandatory parameter. */ static int sctp_process_missing_param(const struct sctp_association *asoc, enum sctp_param paramtype, struct sctp_chunk *chunk, struct sctp_chunk **errp) { struct __sctp_missing report; __u16 len; len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, len); if (*errp) { report.num_missing = htonl(1); report.type = paramtype; sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM, sizeof(report)); sctp_addto_chunk(*errp, sizeof(report), &report); } /* Stop processing this chunk. */ return 0; } /* Report an Invalid Mandatory Parameter. */ static int sctp_process_inv_mandatory(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* Invalid Mandatory Parameter Error has no payload. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, 0); if (*errp) sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0); /* Stop processing this chunk. */ return 0; } static int sctp_process_inv_paramlength(const struct sctp_association *asoc, struct sctp_paramhdr *param, const struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* This is a fatal error. Any accumulated non-fatal errors are * not reported. */ if (*errp) sctp_chunk_free(*errp); /* Create an error chunk and fill it in with our payload. */ *errp = sctp_make_violation_paramlen(asoc, chunk, param); return 0; } /* Do not attempt to handle the HOST_NAME parm. However, do * send back an indicator to the peer. */ static int sctp_process_hn_param(const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { __u16 len = ntohs(param.p->length); /* Processing of the HOST_NAME parameter will generate an * ABORT. If we've accumulated any non-fatal errors, they * would be unrecognized parameters and we should not include * them in the ABORT. */ if (*errp) sctp_chunk_free(*errp); *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED, param.v, len, 0); /* Stop processing this chunk. */ return 0; } static int sctp_verify_ext_param(struct net *net, const struct sctp_endpoint *ep, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int have_asconf = 0; int have_auth = 0; int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_AUTH: have_auth = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: have_asconf = 1; break; } } /* ADD-IP Security: The draft requires us to ABORT or ignore the * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this * only if ADD-IP is turned on and we are not backward-compatible * mode. */ if (net->sctp.addip_noauth) return 1; if (ep->asconf_enable && !have_auth && have_asconf) return 0; return 1; } static void sctp_process_ext_param(struct sctp_association *asoc, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_RECONF: if (asoc->ep->reconf_enable) asoc->peer.reconf_capable = 1; break; case SCTP_CID_FWD_TSN: if (asoc->ep->prsctp_enable) asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he * supports AUTH. */ if (asoc->ep->auth_enable) asoc->peer.auth_capable = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: if (asoc->ep->asconf_enable) asoc->peer.asconf_capable = 1; break; case SCTP_CID_I_DATA: if (asoc->ep->intl_enable) asoc->peer.intl_capable = 1; break; default: break; } } } /* RFC 3.2.1 & the Implementers Guide 2.2. * * The Parameter Types are encoded such that the * highest-order two bits specify the action that must be * taken if the processing endpoint does not recognize the * Parameter Type. * * 00 - Stop processing this parameter; do not process any further * parameters within this chunk * * 01 - Stop processing this parameter, do not process any further * parameters within this chunk, and report the unrecognized * parameter in an 'Unrecognized Parameter' ERROR chunk. * * 10 - Skip this parameter and continue processing. * * 11 - Skip this parameter and continue processing but * report the unrecognized parameter in an * 'Unrecognized Parameter' ERROR chunk. * * Return value: * SCTP_IERROR_NO_ERROR - continue with the chunk * SCTP_IERROR_ERROR - stop and report an error. * SCTP_IERROR_NOMEME - out of memory. */ static enum sctp_ierror sctp_process_unk_param( const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { int retval = SCTP_IERROR_NO_ERROR; switch (param.p->type & SCTP_PARAM_ACTION_MASK) { case SCTP_PARAM_ACTION_DISCARD: retval = SCTP_IERROR_ERROR; break; case SCTP_PARAM_ACTION_SKIP: break; case SCTP_PARAM_ACTION_DISCARD_ERR: retval = SCTP_IERROR_ERROR; fallthrough; case SCTP_PARAM_ACTION_SKIP_ERR: /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) { *errp = sctp_make_op_error_limited(asoc, chunk); if (!*errp) { /* If there is no memory for generating the * ERROR report as specified, an ABORT will be * triggered to the peer and the association * won't be established. */ retval = SCTP_IERROR_NOMEM; break; } } if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, ntohs(param.p->length))) sctp_addto_chunk(*errp, ntohs(param.p->length), param.v); break; default: break; } return retval; } /* Verify variable length parameters * Return values: * SCTP_IERROR_ABORT - trigger an ABORT * SCTP_IERROR_NOMEM - out of memory (abort) * SCTP_IERROR_ERROR - stop processing, trigger an ERROR * SCTP_IERROR_NO_ERROR - continue with the chunk */ static enum sctp_ierror sctp_verify_param(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, union sctp_params param, enum sctp_cid cid, struct sctp_chunk *chunk, struct sctp_chunk **err_chunk) { struct sctp_hmac_algo_param *hmacs; int retval = SCTP_IERROR_NO_ERROR; __u16 n_elt, id = 0; int i; /* FIXME - This routine is not looking at each parameter per the * chunk type, i.e., unrecognized parameters should be further * identified based on the chunk id. */ switch (param.p->type) { case SCTP_PARAM_IPV4_ADDRESS: case SCTP_PARAM_IPV6_ADDRESS: case SCTP_PARAM_COOKIE_PRESERVATIVE: case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: case SCTP_PARAM_STATE_COOKIE: case SCTP_PARAM_HEARTBEAT_INFO: case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: case SCTP_PARAM_ECN_CAPABLE: case SCTP_PARAM_ADAPTATION_LAYER_IND: break; case SCTP_PARAM_SUPPORTED_EXT: if (!sctp_verify_ext_param(net, ep, param)) return SCTP_IERROR_ABORT; break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto unhandled; if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ sctp_process_hn_param(asoc, param, chunk, err_chunk); retval = SCTP_IERROR_ABORT; break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association * MUST be aborted. The ABORT chunk SHOULD contain the error * cause 'Protocol Violation'. */ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or * INIT-ACK chunk if the sender wants to receive authenticated * chunks. Its maximum length is 260 bytes. */ if (260 < ntohs(param.p->length)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) >> 1; /* SCTP-AUTH: Section 6.1 * The HMAC algorithm based on SHA-1 MUST be supported and * included in the HMAC-ALGO parameter. */ for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); if (id == SCTP_AUTH_HMAC_ID_SHA1) break; } if (id != SCTP_AUTH_HMAC_ID_SHA1) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); retval = sctp_process_unk_param(asoc, param, chunk, err_chunk); break; } return retval; } /* Verify the INIT packet before we process it. */ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, enum sctp_cid cid, struct sctp_init_chunk *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **errp) { union sctp_params param; bool has_cookie = false; int result; /* Check for missing mandatory parameters. Note: Initial TSN is * also mandatory, but is not checked here since the valid range * is 0..2**32-1. RFC4960, section 3.3.3. */ if (peer_init->init_hdr.num_outbound_streams == 0 || peer_init->init_hdr.num_inbound_streams == 0 || peer_init->init_hdr.init_tag == 0 || ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW) return sctp_process_inv_mandatory(asoc, chunk, errp); sctp_walk_params(param, peer_init, init_hdr.params) { if (param.p->type == SCTP_PARAM_STATE_COOKIE) has_cookie = true; } /* There is a possibility that a parameter length was bad and * in that case we would have stoped walking the parameters. * The current param.p would point at the bad one. * Current consensus on the mailing list is to generate a PROTOCOL * VIOLATION error. We build the ERROR chunk here and let the normal * error handling code build and send the packet. */ if (param.v != (void *)chunk->chunk_end) return sctp_process_inv_paramlength(asoc, param.p, chunk, errp); /* The only missing mandatory param possible today is * the state cookie for an INIT-ACK chunk. */ if ((SCTP_CID_INIT_ACK == cid) && !has_cookie) return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE, chunk, errp); /* Verify all the variable length parameters */ sctp_walk_params(param, peer_init, init_hdr.params) { result = sctp_verify_param(net, ep, asoc, param, cid, chunk, errp); switch (result) { case SCTP_IERROR_ABORT: case SCTP_IERROR_NOMEM: return 0; case SCTP_IERROR_ERROR: return 1; case SCTP_IERROR_NO_ERROR: default: break; } } /* for (loop through all parameters) */ return 1; } /* Unpack the parameters in an INIT packet into an association. * Returns 0 on failure, else success. * FIXME: This is an association method. */ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, const union sctp_addr *peer_addr, struct sctp_init_chunk *peer_init, gfp_t gfp) { struct sctp_transport *transport; struct list_head *pos, *temp; union sctp_params param; union sctp_addr addr; struct sctp_af *af; int src_match = 0; /* We must include the address that the INIT packet came from. * This is the only address that matters for an INIT packet. * When processing a COOKIE ECHO, we retrieve the from address * of the INIT from the cookie. */ /* This implementation defaults to making the first transport * added as the primary transport. The source address seems to * be a better choice than any of the embedded addresses. */ asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr)) src_match = 1; /* Process the initialization parameters. */ sctp_walk_params(param, peer_init, init_hdr.params) { if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, chunk->sctp_hdr->source, 0)) continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } if (!sctp_process_param(asoc, param, peer_addr, gfp)) goto clean_up; } /* source address of chunk may not match any valid address */ if (!src_match) goto clean_up; /* AUTH: After processing the parameters, make sure that we * have all the required info to potentially do authentications. */ if (asoc->peer.auth_capable && (!asoc->peer.peer_random || !asoc->peer.peer_hmacs)) asoc->peer.auth_capable = 0; /* In a non-backward compatible mode, if the peer claims * support for ADD-IP but not AUTH, the ADD-IP spec states * that we MUST ABORT the association. Section 6. The section * also give us an option to silently ignore the packet, which * is what we'll do here. */ if (!asoc->base.net->sctp.addip_noauth && (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); asoc->peer.asconf_capable = 0; goto clean_up; } /* Walk list of transports, removing transports in the UNKNOWN state. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state == SCTP_UNKNOWN) { sctp_assoc_rm_peer(asoc, transport); } } /* The fixed INIT headers are always in network byte * order. */ asoc->peer.i.init_tag = ntohl(peer_init->init_hdr.init_tag); asoc->peer.i.a_rwnd = ntohl(peer_init->init_hdr.a_rwnd); asoc->peer.i.num_outbound_streams = ntohs(peer_init->init_hdr.num_outbound_streams); asoc->peer.i.num_inbound_streams = ntohs(peer_init->init_hdr.num_inbound_streams); asoc->peer.i.initial_tsn = ntohl(peer_init->init_hdr.initial_tsn); asoc->strreset_inseq = asoc->peer.i.initial_tsn; /* Apply the upper bounds for output streams based on peer's * number of inbound streams. */ if (asoc->c.sinit_num_ostreams > ntohs(peer_init->init_hdr.num_inbound_streams)) { asoc->c.sinit_num_ostreams = ntohs(peer_init->init_hdr.num_inbound_streams); } if (asoc->c.sinit_max_instreams > ntohs(peer_init->init_hdr.num_outbound_streams)) { asoc->c.sinit_max_instreams = ntohs(peer_init->init_hdr.num_outbound_streams); } /* Copy Initiation tag from INIT to VT_peer in cookie. */ asoc->c.peer_vtag = asoc->peer.i.init_tag; /* Peer Rwnd : Current calculated value of the peer's rwnd. */ asoc->peer.rwnd = asoc->peer.i.a_rwnd; /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily * high (for example, implementations MAY use the size of the receiver * advertised window). */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { transport->ssthresh = asoc->peer.i.a_rwnd; } /* Set up the TSN tracking pieces. */ if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, asoc->peer.i.initial_tsn, gfp)) goto clean_up; /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number * * The stream sequence number in all the streams shall start * from 0 when the association is established. Also, when the * stream sequence number reaches the value 65535 the next * stream sequence number shall be set to 0. */ if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, gfp)) goto clean_up; /* Update frag_point when stream_interleave may get changed. */ sctp_assoc_update_frag_point(asoc); if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * * When an endpoint has an ASCONF signaled change to be sent to the * remote endpoint it should do the following: * ... * A2) A serial number should be assigned to the Chunk. The serial * number should be a monotonically increasing number. All serial * numbers are defined to be initialized at the start of the * association to the same value as the Initial TSN. */ asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1; return 1; clean_up: /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state != SCTP_ACTIVE) sctp_assoc_rm_peer(asoc, transport); } nomem: return 0; } /* Update asoc with the option described in param. * * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT * * asoc is the association to update. * param is the variable length parameter to use for update. * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO. * If the current packet is an INIT we want to minimize the amount of * work we do. In particular, we should not build transport * structures for the addresses. */ static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp) { struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; struct net *net = asoc->base.net; struct sctp_transport *t; enum sctp_scope scope; union sctp_addr addr; struct sctp_af *af; int retval = 1, i; u32 stale; __u16 sat; /* We maintain all INIT parameters in network byte order all the * time. This allows us to not worry about whether the parameters * came from a fresh INIT, and INIT ACK, or were stored in a cookie. */ switch (param.p->type) { case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 != asoc->base.sk->sk_family) break; goto do_addr_param; case SCTP_PARAM_IPV4_ADDRESS: /* v4 addresses are not allowed on v6-only socket */ if (ipv6_only_sock(asoc->base.sk)) break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) return 0; break; case SCTP_PARAM_COOKIE_PRESERVATIVE: if (!net->sctp.cookie_preserve_enable) break; stale = ntohl(param.life->lifespan_increment); /* Suggested Cookie Life span increment's unit is msec, * (1/1000sec). */ asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale); break; case SCTP_PARAM_HOST_NAME_ADDRESS: pr_debug("%s: unimplemented SCTP_HOST_NAME_ADDRESS\n", __func__); break; case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: /* Turn off the default values first so we'll know which * ones are really set by the peer. */ asoc->peer.ipv4_address = 0; asoc->peer.ipv6_address = 0; /* Assume that peer supports the address family * by which it sends a packet. */ if (peer_addr->sa.sa_family == AF_INET6) asoc->peer.ipv6_address = 1; else if (peer_addr->sa.sa_family == AF_INET) asoc->peer.ipv4_address = 1; /* Cycle through address types; avoid divide by 0. */ sat = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); if (sat) sat /= sizeof(__u16); for (i = 0; i < sat; ++i) { switch (param.sat->types[i]) { case SCTP_PARAM_IPV4_ADDRESS: asoc->peer.ipv4_address = 1; break; case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 == asoc->base.sk->sk_family) asoc->peer.ipv6_address = 1; break; case SCTP_PARAM_HOST_NAME_ADDRESS: asoc->peer.hostname_address = 1; break; default: /* Just ignore anything else. */ break; } } break; case SCTP_PARAM_STATE_COOKIE: asoc->peer.cookie_len = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); kfree(asoc->peer.cookie); asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp); if (!asoc->peer.cookie) retval = 0; break; case SCTP_PARAM_HEARTBEAT_INFO: /* Would be odd to receive, but it causes no problems. */ break; case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: /* Rejected during verify stage. */ break; case SCTP_PARAM_ECN_CAPABLE: if (asoc->ep->ecn_enable) { asoc->peer.ecn_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_ADAPTATION_LAYER_IND: asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind); break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto fall_through; addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af) break; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) break; if (!af->addr_valid(&addr, NULL, NULL)) break; t = sctp_assoc_lookup_paddr(asoc, &addr); if (!t) break; sctp_assoc_set_primary(asoc, t); break; case SCTP_PARAM_SUPPORTED_EXT: sctp_process_ext_param(asoc, param); break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (asoc->ep->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto fall_through; /* Save peer's random parameter */ kfree(asoc->peer.peer_random); asoc->peer.peer_random = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_random) { retval = 0; break; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto fall_through; /* Save peer's HMAC list */ kfree(asoc->peer.peer_hmacs); asoc->peer.peer_hmacs = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_hmacs) { retval = 0; break; } /* Set the default HMAC the peer requested*/ sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo); break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto fall_through; kfree(asoc->peer.peer_chunks); asoc->peer.peer_chunks = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_chunks) retval = 0; break; fall_through: default: /* Any unrecognized parameters should have been caught * and handled by sctp_verify_param() which should be * called prior to this routine. Simply log the error * here. */ pr_debug("%s: ignoring param:%d for association:%p.\n", __func__, ntohs(param.p->type), asoc); break; } return retval; } /* Select a new verification tag. */ __u32 sctp_generate_tag(const struct sctp_endpoint *ep) { /* I believe that this random number generator complies with RFC1750. * A tag of 0 is reserved for special cases (e.g. INIT). */ __u32 x; do { get_random_bytes(&x, sizeof(__u32)); } while (x == 0); return x; } /* Select an initial TSN to send during startup. */ __u32 sctp_generate_tsn(const struct sctp_endpoint *ep) { __u32 retval; get_random_bytes(&retval, sizeof(__u32)); return retval; } /* * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Address Parameter and other parameter will not be wrapped in this function */ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, union sctp_addr *addr, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; union sctp_addr_param addrparam; int addrlen; struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; length += addrlen; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(asoc->addip_serial++); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); retval->param_hdr.v = sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP * 3.2.1 Add IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC001 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * 3.2.2 Delete IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC002 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * */ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, union sctp_addr *laddr, struct sockaddr *addrs, int addrcnt, __be16 flags) { union sctp_addr_param addr_param; struct sctp_addip_param param; int paramlen = sizeof(param); struct sctp_chunk *retval; int addr_param_len = 0; union sctp_addr *addr; int totallen = 0, i; int del_pickup = 0; struct sctp_af *af; void *addr_buf; /* Get total length of all the address parameters. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); totallen += paramlen; totallen += addr_param_len; addr_buf += af->sockaddr_len; if (asoc->asconf_addr_del_pending && !del_pickup) { /* reuse the parameter length from the same scope one */ totallen += paramlen; totallen += addr_param_len; del_pickup = 1; pr_debug("%s: picked same-scope del_pending addr, " "totallen for all addresses is %d\n", __func__, totallen); } } /* Create an asconf chunk with the required length. */ retval = sctp_make_asconf(asoc, laddr, totallen); if (!retval) return NULL; /* Add the address parameters to the asconf chunk. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = flags; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); addr_buf += af->sockaddr_len; } if (flags == SCTP_PARAM_ADD_IP && del_pickup) { addr = asoc->asconf_addr_del_pending; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = SCTP_PARAM_DEL_IP; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); } return retval; } /* ADDIP * 3.2.4 Set Primary IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type =0xC004 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF chunk with Set Primary IP address parameter. */ struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr) { struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); union sctp_addr_param addrparam; struct sctp_addip_param param; struct sctp_chunk *retval; int len = sizeof(param); int addrlen; addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; len += addrlen; /* Create the chunk and make asconf header. */ retval = sctp_make_asconf(asoc, addr, len); if (!retval) return NULL; param.param_hdr.type = SCTP_PARAM_SET_PRIMARY; param.param_hdr.length = htons(len); param.crr_id = 0; sctp_addto_chunk(retval, sizeof(param), ¶m); sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x80 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF_ACK chunk with enough space for the parameter responses. */ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, __u32 serial, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(serial); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); return retval; } /* Add response parameters to an ASCONF_ACK chunk. */ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, __be16 err_code, struct sctp_addip_param *asconf_param) { struct sctp_addip_param ack_param; struct sctp_errhdr err_param; int asconf_param_len = 0; int err_param_len = 0; __be16 response_type; if (SCTP_ERROR_NO_ERROR == err_code) { response_type = SCTP_PARAM_SUCCESS_REPORT; } else { response_type = SCTP_PARAM_ERR_CAUSE; err_param_len = sizeof(err_param); if (asconf_param) asconf_param_len = ntohs(asconf_param->param_hdr.length); } /* Add Success Indication or Error Cause Indication parameter. */ ack_param.param_hdr.type = response_type; ack_param.param_hdr.length = htons(sizeof(ack_param) + err_param_len + asconf_param_len); ack_param.crr_id = crr_id; sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param); if (SCTP_ERROR_NO_ERROR == err_code) return; /* Add Error Cause parameter. */ err_param.cause = err_code; err_param.length = htons(err_param_len + asconf_param_len); sctp_addto_chunk(chunk, err_param_len, &err_param); /* Add the failed TLV copied from ASCONF chunk. */ if (asconf_param) sctp_addto_chunk(chunk, asconf_param_len, asconf_param); } /* Process a asconf parameter. */ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, struct sctp_chunk *asconf, struct sctp_addip_param *asconf_param) { union sctp_addr_param *addr_param; struct sctp_transport *peer; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP && asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP && asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY) return SCTP_ERROR_UNKNOWN_PARAM; switch (addr_param->p.type) { case SCTP_PARAM_IPV6_ADDRESS: if (!asoc->peer.ipv6_address) return SCTP_ERROR_DNS_FAILED; break; case SCTP_PARAM_IPV4_ADDRESS: if (!asoc->peer.ipv4_address) return SCTP_ERROR_DNS_FAILED; break; default: return SCTP_ERROR_DNS_FAILED; } af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. * (note: wildcard is permitted and requires special handling so * make sure we check for that) */ if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb)) return SCTP_ERROR_DNS_FAILED; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* Section 4.2.1: * If the address 0.0.0.0 or ::0 is provided, the source * address of the packet MUST be added. */ if (af->is_any(&addr)) memcpy(&addr, &asconf->source, sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_ADD_IP, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address * request and does not have the local resources to add this * new address to the association, it MUST return an Error * Cause TLV set to the new error code 'Operation Refused * Due to Resource Shortage'. */ peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED); if (!peer) return SCTP_ERROR_RSRC_LOW; /* Start the heartbeat timer. */ sctp_transport_reset_hb_timer(peer); asoc->new_transport = peer; break; case SCTP_PARAM_DEL_IP: /* ADDIP 4.3 D7) If a request is received to delete the * last remaining IP address of a peer endpoint, the receiver * MUST send an Error Cause TLV with the error cause set to the * new error code 'Request to Delete Last Remaining IP Address'. */ if (asoc->peer.transport_count == 1) return SCTP_ERROR_DEL_LAST_IP; /* ADDIP 4.3 D8) If a request is received to delete an IP * address which is also the source address of the IP packet * which contained the ASCONF chunk, the receiver MUST reject * this request. To reject the request the receiver MUST send * an Error Cause TLV set to the new error code 'Request to * Delete Source IP Address' */ if (sctp_cmp_addr_exact(&asconf->source, &addr)) return SCTP_ERROR_DEL_SRC_IP; /* Section 4.2.2 * If the address 0.0.0.0 or ::0 is provided, all * addresses of the peer except the source address of the * packet MUST be deleted. */ if (af->is_any(&addr)) { sctp_assoc_set_primary(asoc, asconf->transport); sctp_assoc_del_nonprimary_peers(asoc, asconf->transport); return SCTP_ERROR_NO_ERROR; } /* If the address is not part of the association, the * ASCONF-ACK with Error Cause Indication Parameter * which including cause of Unresolvable Address should * be sent. */ peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 * If the address 0.0.0.0 or ::0 is provided, the receiver * MAY mark the source address of the packet as its * primary. */ if (af->is_any(&addr)) memcpy(&addr, sctp_source(asconf), sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_SET_PRIMARY, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_set_primary(asoc, peer); break; } return SCTP_ERROR_NO_ERROR; } /* Verify the ASCONF packet before we process it. */ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, bool addr_param_needed, struct sctp_paramhdr **errp) { struct sctp_addip_chunk *addip; bool addr_param_seen = false; union sctp_params param; addip = (struct sctp_addip_chunk *)chunk->chunk_hdr; sctp_walk_params(param, addip, addip_hdr.params) { size_t length = ntohs(param.p->length); *errp = param.p; switch (param.p->type) { case SCTP_PARAM_ERR_CAUSE: break; case SCTP_PARAM_IPV4_ADDRESS: if (length != sizeof(struct sctp_ipv4addr_param)) return false; /* ensure there is only one addr param and it's in the * beginning of addip_hdr params, or we reject it. */ if (param.v != addip->addip_hdr.params) return false; addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: if (length != sizeof(struct sctp_ipv6addr_param)) return false; if (param.v != addip->addip_hdr.params) return false; addr_param_seen = true; break; case SCTP_PARAM_ADD_IP: case SCTP_PARAM_DEL_IP: case SCTP_PARAM_SET_PRIMARY: /* In ASCONF chunks, these need to be first. */ if (addr_param_needed && !addr_param_seen) return false; length = ntohs(param.addip->param_hdr.length); if (length < sizeof(struct sctp_addip_param) + sizeof(**errp)) return false; break; case SCTP_PARAM_SUCCESS_REPORT: case SCTP_PARAM_ADAPTATION_LAYER_IND: if (length != sizeof(struct sctp_addip_param)) return false; break; default: /* This is unknown to us, reject! */ return false; } } /* Remaining sanity checks. */ if (addr_param_needed && !addr_param_seen) return false; if (!addr_param_needed && addr_param_seen) return false; if (param.v != chunk->chunk_end) return false; return true; } /* Process an incoming ASCONF chunk with the next expected serial no. and * return an ASCONF_ACK chunk to be sent in response. */ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf) { union sctp_addr_param *addr_param; struct sctp_addip_chunk *addip; struct sctp_chunk *asconf_ack; bool all_param_pass = true; struct sctp_addiphdr *hdr; int length = 0, chunk_len; union sctp_params param; __be16 err_code; __u32 serial; addip = (struct sctp_addip_chunk *)asconf->chunk_hdr; chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); hdr = (struct sctp_addiphdr *)asconf->skb->data; serial = ntohl(hdr->serial); /* Skip the addiphdr and store a pointer to address parameter. */ length = sizeof(*hdr); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); chunk_len -= length; /* Skip the address parameter and store a pointer to the first * asconf parameter. */ length = ntohs(addr_param->p.length); chunk_len -= length; /* create an ASCONF_ACK chunk. * Based on the definitions of parameters, we know that the size of * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF * parameters. */ asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4); if (!asconf_ack) goto done; /* Process the TLVs contained within the ASCONF chunk. */ sctp_walk_params(param, addip, addip_hdr.params) { /* Skip preceeding address parameters. */ if (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS) continue; err_code = sctp_process_asconf_param(asoc, asconf, param.addip); /* ADDIP 4.1 A7) * If an error response is received for a TLV parameter, * all TLVs with no response before the failed TLV are * considered successful if not reported. All TLVs after * the failed response are considered unsuccessful unless * a specific success indication is present for the parameter. */ if (err_code != SCTP_ERROR_NO_ERROR) all_param_pass = false; if (!all_param_pass) sctp_add_asconf_response(asconf_ack, param.addip->crr_id, err_code, param.addip); /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add * an IP address sends an 'Out of Resource' in its response, it * MUST also fail any subsequent add or delete requests bundled * in the ASCONF. */ if (err_code == SCTP_ERROR_RSRC_LOW) goto done; } done: asoc->peer.addip_serial++; /* If we are sending a new ASCONF_ACK hold a reference to it in assoc * after freeing the reference to old asconf ack if any. */ if (asconf_ack) { sctp_chunk_hold(asconf_ack); list_add_tail(&asconf_ack->transmitted_list, &asoc->asconf_ack_list); } return asconf_ack; } /* Process a asconf parameter that is successfully acked. */ static void sctp_asconf_param_success(struct sctp_association *asoc, struct sctp_addip_param *asconf_param) { struct sctp_bind_addr *bp = &asoc->base.bind_addr; union sctp_addr_param *addr_param; struct sctp_sockaddr_entry *saddr; struct sctp_transport *transport; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* This is always done in BH context with a socket lock * held, so the list can not change. */ local_bh_disable(); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, &addr)) saddr->state = SCTP_ADDR_SRC; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; case SCTP_PARAM_DEL_IP: local_bh_disable(); sctp_del_bind_addr(bp, &addr); if (asoc->asconf_addr_del_pending != NULL && sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) { kfree(asoc->asconf_addr_del_pending); asoc->asconf_addr_del_pending = NULL; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; default: break; } } /* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk * for the given asconf parameter. If there is no response for this parameter, * return the error code based on the third argument 'no_err'. * ADDIP 4.1 * A7) If an error response is received for a TLV parameter, all TLVs with no * response before the failed TLV are considered successful if not reported. * All TLVs after the failed response are considered unsuccessful unless a * specific success indication is present for the parameter. */ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, struct sctp_addip_param *asconf_param, int no_err) { struct sctp_addip_param *asconf_ack_param; struct sctp_errhdr *err_param; int asconf_ack_len; __be16 err_code; int length; if (no_err) err_code = SCTP_ERROR_NO_ERROR; else err_code = SCTP_ERROR_REQ_REFUSED; asconf_ack_len = ntohs(asconf_ack->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); /* Skip the addiphdr from the asconf_ack chunk and store a pointer to * the first asconf_ack parameter. */ length = sizeof(struct sctp_addiphdr); asconf_ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data + length); asconf_ack_len -= length; while (asconf_ack_len > 0) { if (asconf_ack_param->crr_id == asconf_param->crr_id) { switch (asconf_ack_param->param_hdr.type) { case SCTP_PARAM_SUCCESS_REPORT: return SCTP_ERROR_NO_ERROR; case SCTP_PARAM_ERR_CAUSE: length = sizeof(*asconf_ack_param); err_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; if (asconf_ack_len > 0) return err_param->cause; else return SCTP_ERROR_INV_PARAM; break; default: return SCTP_ERROR_INV_PARAM; } } length = ntohs(asconf_ack_param->param_hdr.length); asconf_ack_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; } return err_code; } /* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk. */ int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *asconf_ack) { struct sctp_chunk *asconf = asoc->addip_last_asconf; struct sctp_addip_param *asconf_param; __be16 err_code = SCTP_ERROR_NO_ERROR; union sctp_addr_param *addr_param; int asconf_len = asconf->skb->len; int all_param_pass = 0; int length = 0; int no_err = 1; int retval = 0; /* Skip the chunkhdr and addiphdr from the last asconf sent and store * a pointer to address parameter. */ length = sizeof(struct sctp_addip_chunk); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); asconf_len -= length; /* Skip the address parameter in the last asconf sent and store a * pointer to the first asconf parameter. */ length = ntohs(addr_param->p.length); asconf_param = (void *)addr_param + length; asconf_len -= length; /* ADDIP 4.1 * A8) If there is no response(s) to specific TLV parameter(s), and no * failures are indicated, then all request(s) are considered * successful. */ if (asconf_ack->skb->len == sizeof(struct sctp_addiphdr)) all_param_pass = 1; /* Process the TLVs contained in the last sent ASCONF chunk. */ while (asconf_len > 0) { if (all_param_pass) err_code = SCTP_ERROR_NO_ERROR; else { err_code = sctp_get_asconf_response(asconf_ack, asconf_param, no_err); if (no_err && (SCTP_ERROR_NO_ERROR != err_code)) no_err = 0; } switch (err_code) { case SCTP_ERROR_NO_ERROR: sctp_asconf_param_success(asoc, asconf_param); break; case SCTP_ERROR_RSRC_LOW: retval = 1; break; case SCTP_ERROR_UNKNOWN_PARAM: /* Disable sending this type of asconf parameter in * future. */ asoc->peer.addip_disabled_mask |= asconf_param->param_hdr.type; break; case SCTP_ERROR_REQ_REFUSED: case SCTP_ERROR_DEL_LAST_IP: case SCTP_ERROR_DEL_SRC_IP: default: break; } /* Skip the processed asconf parameter and move to the next * one. */ length = ntohs(asconf_param->param_hdr.length); asconf_param = (void *)asconf_param + length; asconf_len -= length; } if (no_err && asoc->src_out_of_asoc_ok) { asoc->src_out_of_asoc_ok = 0; sctp_transport_immediate_rtx(asoc->peer.primary_path); } /* Free the cached last sent asconf chunk. */ list_del_init(&asconf->transmitted_list); sctp_chunk_free(asconf); asoc->addip_last_asconf = NULL; return retval; } /* Make a FWD TSN chunk. */ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_fwdtsn_hdr ftsn_hdr; struct sctp_fwdtsn_skip skip; size_t hint; int i; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.fwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); for (i = 0; i < nstreams; i++) { skip.stream = skiplist[i].stream; skip.ssn = skiplist[i].ssn; sctp_addto_chunk(retval, sizeof(skip), &skip); } return retval; } struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_ifwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_ifwdtsn_hdr ftsn_hdr; size_t hint; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.ifwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist); return retval; } /* RE-CONFIG 3.1 (RE-CONFIG chunk) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 130 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter (optional) / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, int length) { struct sctp_reconf_chunk *reconf; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr; retval->param_hdr.v = reconf->params; return retval; } /* RE-CONFIG 4.1 (STREAM OUT RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 13 | Parameter Length = 16 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Last Assigned TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * RE-CONFIG 4.2 (STREAM IN RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 14 | Parameter Length = 8 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_req( const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in) { __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; if (outlen) { outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST; outreq.param_hdr.length = htons(outlen); outreq.request_seq = htonl(asoc->strreset_outseq); outreq.response_seq = htonl(asoc->strreset_inseq - 1); outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1); sctp_addto_chunk(retval, sizeof(outreq), &outreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } if (inlen) { inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST; inreq.param_hdr.length = htons(inlen); inreq.request_seq = htonl(asoc->strreset_outseq + out); sctp_addto_chunk(retval, sizeof(inreq), &inreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } return retval; } /* RE-CONFIG 4.3 (SSN/TSN RESET ALL) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 15 | Parameter Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc) { struct sctp_strreset_tsnreq tsnreq; __u16 length = sizeof(tsnreq); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST; tsnreq.param_hdr.length = htons(length); tsnreq.request_seq = htonl(asoc->strreset_outseq); sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq); return retval; } /* RE-CONFIG 4.5/4.6 (ADD STREAM) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 17 | Parameter Length = 12 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of new streams | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_addstrm( const struct sctp_association *asoc, __u16 out, __u16 in) { struct sctp_strreset_addstrm addstrm; __u16 size = sizeof(addstrm); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, (!!out + !!in) * size); if (!retval) return NULL; if (out) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(out); addstrm.request_seq = htonl(asoc->strreset_outseq); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } if (in) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(in); addstrm.request_seq = htonl(asoc->strreset_outseq + !!out); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } return retval; } /* RE-CONFIG 4.4 (RESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, __u32 result, __u32 sn) { struct sctp_strreset_resp resp; __u16 length = sizeof(resp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; resp.param_hdr.length = htons(length); resp.response_seq = htonl(sn); resp.result = htonl(result); sctp_addto_chunk(retval, sizeof(resp), &resp); return retval; } /* RE-CONFIG 4.4 OPTIONAL (TSNRESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Receiver's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, __u32 result, __u32 sn, __u32 sender_tsn, __u32 receiver_tsn) { struct sctp_strreset_resptsn tsnresp; __u16 length = sizeof(tsnresp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; tsnresp.param_hdr.length = htons(length); tsnresp.response_seq = htonl(sn); tsnresp.result = htonl(result); tsnresp.senders_next_tsn = htonl(sender_tsn); tsnresp.receivers_next_tsn = htonl(receiver_tsn); sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp); return retval; } bool sctp_verify_reconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp) { struct sctp_reconf_chunk *hdr; union sctp_params param; __be16 last = 0; __u16 cnt = 0; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr, params) { __u16 length = ntohs(param.p->length); *errp = param.p; if (cnt++ > 2) return false; switch (param.p->type) { case SCTP_PARAM_RESET_OUT_REQUEST: if (length < sizeof(struct sctp_strreset_outreq) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_IN_REQUEST)) return false; break; case SCTP_PARAM_RESET_IN_REQUEST: if (length < sizeof(struct sctp_strreset_inreq) || (last && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_RESPONSE: if ((length != sizeof(struct sctp_strreset_resp) && length != sizeof(struct sctp_strreset_resptsn)) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_TSN_REQUEST: if (length != sizeof(struct sctp_strreset_tsnreq) || last) return false; break; case SCTP_PARAM_RESET_ADD_IN_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS)) return false; break; case SCTP_PARAM_RESET_ADD_OUT_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS)) return false; break; default: return false; } last = param.p->type; } return true; } |
50 50 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_ipt.c iptables target interface * *TODO: Add other tables. For now we only support the ipv4 table targets * * Copyright: Jamal Hadi Salim (2002-13) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_ipt.h> #include <net/tc_act/tc_ipt.h> #include <linux/netfilter_ipv4/ip_tables.h> static unsigned int ipt_net_id; static struct tc_action_ops act_ipt_ops; static unsigned int xt_net_id; static struct tc_action_ops act_xt_ops; static int ipt_init_target(struct net *net, struct xt_entry_target *t, char *table, unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; struct ipt_entry e = {}; int ret = 0; target = xt_request_find_target(AF_INET, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) return PTR_ERR(target); t->u.kernel.target = target; memset(&par, 0, sizeof(par)); par.net = net; par.table = table; par.entryinfo = &e; par.target = target; par.targinfo = t->data; par.hook_mask = 1 << hook; par.family = NFPROTO_IPV4; ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); if (ret < 0) { module_put(t->u.kernel.target->me); return ret; } return 0; } static void ipt_destroy_target(struct xt_entry_target *t, struct net *net) { struct xt_tgdtor_param par = { .target = t->u.kernel.target, .targinfo = t->data, .family = NFPROTO_IPV4, .net = net, }; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); } static void tcf_ipt_release(struct tc_action *a) { struct tcf_ipt *ipt = to_ipt(a); if (ipt->tcfi_t) { ipt_destroy_target(ipt->tcfi_t, a->idrinfo->net); kfree(ipt->tcfi_t); } kfree(ipt->tcfi_tname); } static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_IPT_HOOK] = NLA_POLICY_RANGE(NLA_U32, NF_INET_PRE_ROUTING, NF_INET_NUMHOOKS), [TCA_IPT_INDEX] = { .type = NLA_U32 }, [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; char *tname; bool exists = false; int ret = 0, err; u32 hook = 0; u32 index = 0; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_IPT_MAX, nla, ipt_policy, NULL); if (err < 0) return err; if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return 0; if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, false, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } err = -EINVAL; hook = nla_get_u32(tb[TCA_IPT_HOOK]); switch (hook) { case NF_INET_PRE_ROUTING: break; case NF_INET_POST_ROUTING: break; default: goto err1; } if (tb[TCA_IPT_TABLE]) { /* mangle only for now */ if (nla_strcmp(tb[TCA_IPT_TABLE], "mangle")) goto err1; } tname = kstrdup("mangle", GFP_KERNEL); if (unlikely(!tname)) goto err1; t = kmemdup(td, td->u.target_size, GFP_KERNEL); if (unlikely(!t)) goto err2; err = ipt_init_target(net, t, tname, hook); if (err < 0) goto err3; ipt = to_ipt(*a); spin_lock_bh(&ipt->tcf_lock); if (ret != ACT_P_CREATED) { ipt_destroy_target(ipt->tcfi_t, net); kfree(ipt->tcfi_tname); kfree(ipt->tcfi_t); } ipt->tcfi_tname = tname; ipt->tcfi_t = t; ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); return ret; err3: kfree(t); err2: kfree(tname); err1: tcf_idr_release(*a, bind); return err; } static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *ipt = to_ipt(a); struct xt_action_param par; struct nf_hook_state state = { .net = dev_net(skb->dev), .in = skb->dev, .hook = ipt->tcfi_hook, .pf = NFPROTO_IPV4, }; if (skb_unclone(skb, GFP_ATOMIC)) return TC_ACT_UNSPEC; spin_lock(&ipt->tcf_lock); tcf_lastuse_update(&ipt->tcf_tm); bstats_update(&ipt->tcf_bstats, skb); /* yes, we have to worry about both in and out dev * worry later - danger - this API seems to have changed * from earlier kernels */ par.state = &state; par.target = ipt->tcfi_t->u.kernel.target; par.targinfo = ipt->tcfi_t->data; ret = par.target->target(skb, &par); switch (ret) { case NF_ACCEPT: result = TC_ACT_OK; break; case NF_DROP: result = TC_ACT_SHOT; ipt->tcf_qstats.drops++; break; case XT_CONTINUE: result = TC_ACT_PIPE; break; default: net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", ret); result = TC_ACT_OK; break; } spin_unlock(&ipt->tcf_lock); return result; } static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_ipt *ipt = to_ipt(a); struct xt_entry_target *t; struct tcf_t tm; struct tc_cnt c; /* for simple targets kernel size == user size * user name = target name * for foolproof you need to not assume this */ spin_lock_bh(&ipt->tcf_lock); t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); if (unlikely(!t)) goto nla_put_failure; c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind; c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) || nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) || nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) goto nla_put_failure; tcf_tm_dump(&tm, &ipt->tcf_tm); if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; spin_unlock_bh(&ipt->tcf_lock); kfree(t); return skb->len; nla_put_failure: spin_unlock_bh(&ipt->tcf_lock); nlmsg_trim(skb, b); kfree(t); return -1; } static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ipt_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ipt_net_id); return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .id = TCA_ID_IPT, .owner = THIS_MODULE, .act = tcf_ipt_act, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, .size = sizeof(struct tcf_ipt), }; static __net_init int ipt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ipt_net_id); return tc_action_net_init(net, tn, &act_ipt_ops); } static void __net_exit ipt_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, ipt_net_id); } static struct pernet_operations ipt_net_ops = { .init = ipt_init_net, .exit_batch = ipt_exit_net, .id = &ipt_net_id, .size = sizeof(struct tc_action_net), }; static int tcf_xt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, xt_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, xt_net_id); return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_xt_ops = { .kind = "xt", .id = TCA_ID_XT, .owner = THIS_MODULE, .act = tcf_ipt_act, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, .size = sizeof(struct tcf_ipt), }; static __net_init int xt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, xt_net_id); return tc_action_net_init(net, tn, &act_xt_ops); } static void __net_exit xt_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, xt_net_id); } static struct pernet_operations xt_net_ops = { .init = xt_init_net, .exit_batch = xt_exit_net, .id = &xt_net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); MODULE_DESCRIPTION("Iptables target actions"); MODULE_LICENSE("GPL"); MODULE_ALIAS("act_xt"); static int __init ipt_init_module(void) { int ret1, ret2; ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops); if (ret1 < 0) pr_err("Failed to load xt action\n"); ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops); if (ret2 < 0) pr_err("Failed to load ipt action\n"); if (ret1 < 0 && ret2 < 0) { return ret1; } else return 0; } static void __exit ipt_cleanup_module(void) { tcf_unregister_action(&act_ipt_ops, &ipt_net_ops); tcf_unregister_action(&act_xt_ops, &xt_net_ops); } module_init(ipt_init_module); module_exit(ipt_cleanup_module); |
312 313 1 7 1 1 7 256 255 254 890 890 256 1 1 1 1 1 8 8 7 7 7 7 7 1 1 1 1 2 2 3 3 49 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 | // SPDX-License-Identifier: GPL-2.0-or-later /** -*- linux-c -*- *********************************************************** * Linux PPP over Ethernet (PPPoX/PPPoE) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoE --- PPP over Ethernet (RFC 2516) * * Version: 0.7.0 * * 070228 : Fix to allow multiple sessions with same remote MAC and same * session id by including the local device ifindex in the * tuple identifying a session. This also ensures packets can't * be injected into a session from interfaces other than the one * specified by userspace. Florian Zumbiehl <florz@florz.de> * (Oh, BTW, this one is YYMMDD, in case you were wondering ...) * 220102 : Fix module use count on failure in pppoe_create, pppox_sk -acme * 030700 : Fixed connect logic to allow for disconnect. * 270700 : Fixed potential SMP problems; we must protect against * simultaneous invocation of ppp_input * and ppp_unregister_channel. * 040800 : Respect reference count mechanisms on net-devices. * 200800 : fix kfree(skb) in pppoe_rcv (acme) * Module reference count is decremented in the right spot now, * guards against sock_put not actually freeing the sk * in pppoe_release. * 051000 : Initialization cleanup. * 111100 : Fix recvmsg. * 050101 : Fix PADT processing. * 140501 : Use pppoe_rcv_core to handle all backlog. (Alexey) * 170701 : Do not lock_sock with rwlock held. (DaveM) * Ignore discovery frames if user has socket * locked. (DaveM) * Ignore return value of dev_queue_xmit in __pppoe_xmit * or else we may kfree an SKB twice. (DaveM) * 190701 : When doing copies of skb's in __pppoe_xmit, always delete * the original skb that was passed in on success, never on * failure. Delete the copy of the skb on failure to avoid * a memory leak. * 081001 : Misc. cleanup (licence string, non-blocking, prevent * reference of device on close). * 121301 : New ppp channels interface; cannot unregister a channel * from interrupts. Thus, we mark the socket as a ZOMBIE * and do the unregistration later. * 081002 : seq_file support for proc stuff -acme * 111602 : Merge all 2.4 fixes into 2.5/2.6 tree. Label 2.5/2.6 * as version 0.7. Spacing cleanup. * Author: Michal Ostrowski <mostrows@speakeasy.net> * Contributors: * Arnaldo Carvalho de Melo <acme@conectiva.com.br> * David S. Miller (davem@redhat.com) * * License: */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/if_pppox.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/notifier.h> #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <linux/uaccess.h> #define PPPOE_HASH_BITS 4 #define PPPOE_HASH_SIZE (1 << PPPOE_HASH_BITS) #define PPPOE_HASH_MASK (PPPOE_HASH_SIZE - 1) static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb); static const struct proto_ops pppoe_ops; static const struct ppp_channel_ops pppoe_chan_ops; /* per-net private data for this module */ static unsigned int pppoe_net_id __read_mostly; struct pppoe_net { /* * we could use _single_ hash table for all * nets by injecting net id into the hash but * it would increase hash chains and add * a few additional math comparisons messy * as well, moreover in case of SMP less locking * controversy here */ struct pppox_sock *hash_table[PPPOE_HASH_SIZE]; rwlock_t hash_lock; }; /* * PPPoE could be in the following stages: * 1) Discovery stage (to obtain remote MAC and Session ID) * 2) Session stage (MAC and SID are known) * * Ethernet frames have a special tag for this but * we use simpler approach based on session id */ static inline bool stage_session(__be16 sid) { return sid != 0; } static inline struct pppoe_net *pppoe_pernet(struct net *net) { return net_generic(net, pppoe_net_id); } static inline int cmp_2_addr(struct pppoe_addr *a, struct pppoe_addr *b) { return a->sid == b->sid && ether_addr_equal(a->remote, b->remote); } static inline int cmp_addr(struct pppoe_addr *a, __be16 sid, char *addr) { return a->sid == sid && ether_addr_equal(a->remote, addr); } #if 8 % PPPOE_HASH_BITS #error 8 must be a multiple of PPPOE_HASH_BITS #endif static int hash_item(__be16 sid, unsigned char *addr) { unsigned char hash = 0; unsigned int i; for (i = 0; i < ETH_ALEN; i++) hash ^= addr[i]; for (i = 0; i < sizeof(sid_t) * 8; i += 8) hash ^= (__force __u32)sid >> i; for (i = 8; (i >>= 1) >= PPPOE_HASH_BITS;) hash ^= hash >> i; return hash & PPPOE_HASH_MASK; } /********************************************************************** * * Set/get/delete/rehash items (internal versions) * **********************************************************************/ static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid, unsigned char *addr, int ifindex) { int hash = hash_item(sid, addr); struct pppox_sock *ret; ret = pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) return ret; ret = ret->next; } return NULL; } static int __set_item(struct pppoe_net *pn, struct pppox_sock *po) { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); struct pppox_sock *ret; ret = pn->hash_table[hash]; while (ret) { if (cmp_2_addr(&ret->pppoe_pa, &po->pppoe_pa) && ret->pppoe_ifindex == po->pppoe_ifindex) return -EALREADY; ret = ret->next; } po->next = pn->hash_table[hash]; pn->hash_table[hash] = po; return 0; } static void __delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { int hash = hash_item(sid, addr); struct pppox_sock *ret, **src; ret = pn->hash_table[hash]; src = &pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) { *src = ret->next; break; } src = &ret->next; ret = ret->next; } } /********************************************************************** * * Set/get/delete/rehash items * **********************************************************************/ static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid, unsigned char *addr, int ifindex) { struct pppox_sock *po; read_lock_bh(&pn->hash_lock); po = __get_item(pn, sid, addr, ifindex); if (po) sock_hold(sk_pppox(po)); read_unlock_bh(&pn->hash_lock); return po; } static inline struct pppox_sock *get_item_by_addr(struct net *net, struct sockaddr_pppox *sp) { struct net_device *dev; struct pppoe_net *pn; struct pppox_sock *pppox_sock = NULL; int ifindex; rcu_read_lock(); dev = dev_get_by_name_rcu(net, sp->sa_addr.pppoe.dev); if (dev) { ifindex = dev->ifindex; pn = pppoe_pernet(net); pppox_sock = get_item(pn, sp->sa_addr.pppoe.sid, sp->sa_addr.pppoe.remote, ifindex); } rcu_read_unlock(); return pppox_sock; } static inline void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { write_lock_bh(&pn->hash_lock); __delete_item(pn, sid, addr, ifindex); write_unlock_bh(&pn->hash_lock); } /*************************************************************************** * * Handler for device events. * Certain device events require that sockets be unconnected. * **************************************************************************/ static void pppoe_flush_dev(struct net_device *dev) { struct pppoe_net *pn; int i; pn = pppoe_pernet(dev_net(dev)); write_lock_bh(&pn->hash_lock); for (i = 0; i < PPPOE_HASH_SIZE; i++) { struct pppox_sock *po = pn->hash_table[i]; struct sock *sk; while (po) { while (po && po->pppoe_dev != dev) { po = po->next; } if (!po) break; sk = sk_pppox(po); /* We always grab the socket lock, followed by the * hash_lock, in that order. Since we should hold the * sock lock while doing any unbinding, we need to * release the lock we're holding. Hold a reference to * the sock so it doesn't disappear as we're jumping * between locks. */ sock_hold(sk); write_unlock_bh(&pn->hash_lock); lock_sock(sk); if (po->pppoe_dev == dev && sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) { pppox_unbind_sock(sk); sk->sk_state_change(sk); po->pppoe_dev = NULL; dev_put(dev); } release_sock(sk); sock_put(sk); /* Restart the process from the start of the current * hash chain. We dropped locks so the world may have * change from underneath us. */ BUG_ON(pppoe_pernet(dev_net(dev)) == NULL); write_lock_bh(&pn->hash_lock); po = pn->hash_table[i]; } } write_unlock_bh(&pn->hash_lock); } static int pppoe_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Only look at sockets that are using this specific device. */ switch (event) { case NETDEV_CHANGEADDR: case NETDEV_CHANGEMTU: /* A change in mtu or address is a bad thing, requiring * LCP re-negotiation. */ case NETDEV_GOING_DOWN: case NETDEV_DOWN: /* Find every socket on this device and kill it. */ pppoe_flush_dev(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block pppoe_notifier = { .notifier_call = pppoe_device_event, }; /************************************************************************ * * Do the real work of receiving a PPPoE Session frame. * ***********************************************************************/ static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct pppox_sock *relay_po; /* Backlog receive. Semantics of backlog rcv preclude any code from * executing in lock_sock()/release_sock() bounds; meaning sk->sk_state * can't change. */ if (skb->pkt_type == PACKET_OTHERHOST) goto abort_kfree; if (sk->sk_state & PPPOX_BOUND) { ppp_input(&po->chan, skb); } else if (sk->sk_state & PPPOX_RELAY) { relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay); if (relay_po == NULL) goto abort_kfree; if ((sk_pppox(relay_po)->sk_state & PPPOX_CONNECTED) == 0) goto abort_put; if (!__pppoe_xmit(sk_pppox(relay_po), skb)) goto abort_put; sock_put(sk_pppox(relay_po)); } else { if (sock_queue_rcv_skb(sk, skb)) goto abort_kfree; } return NET_RX_SUCCESS; abort_put: sock_put(sk_pppox(relay_po)); abort_kfree: kfree_skb(skb); return NET_RX_DROP; } /************************************************************************ * * Receive wrapper called in BH context. * ***********************************************************************/ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct pppoe_hdr *ph; struct pppox_sock *po; struct pppoe_net *pn; int len; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; if (skb_mac_header_len(skb) < ETH_HLEN) goto drop; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto drop; ph = pppoe_hdr(skb); len = ntohs(ph->length); skb_pull_rcsum(skb, sizeof(*ph)); if (skb->len < len) goto drop; if (pskb_trim_rcsum(skb, len)) goto drop; ph = pppoe_hdr(skb); pn = pppoe_pernet(dev_net(dev)); /* Note that get_item does a sock_hold(), so sk_pppox(po) * is known to be safe. */ po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (!po) goto drop; return sk_receive_skb(sk_pppox(po), skb, 0); drop: kfree_skb(skb); out: return NET_RX_DROP; } static void pppoe_unbind_sock_work(struct work_struct *work) { struct pppox_sock *po = container_of(work, struct pppox_sock, proto.pppoe.padt_work); struct sock *sk = sk_pppox(po); lock_sock(sk); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } pppox_unbind_sock(sk); release_sock(sk); sock_put(sk); } /************************************************************************ * * Receive a PPPoE Discovery frame. * This is solely for detection of PADT frames * ***********************************************************************/ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct pppoe_hdr *ph; struct pppox_sock *po; struct pppoe_net *pn; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; if (skb->pkt_type != PACKET_HOST) goto abort; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto abort; ph = pppoe_hdr(skb); if (ph->code != PADT_CODE) goto abort; pn = pppoe_pernet(dev_net(dev)); po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (po) if (!schedule_work(&po->proto.pppoe.padt_work)) sock_put(sk_pppox(po)); abort: kfree_skb(skb); out: return NET_RX_SUCCESS; /* Lies... :-) */ } static struct packet_type pppoes_ptype __read_mostly = { .type = cpu_to_be16(ETH_P_PPP_SES), .func = pppoe_rcv, }; static struct packet_type pppoed_ptype __read_mostly = { .type = cpu_to_be16(ETH_P_PPP_DISC), .func = pppoe_disc_rcv, }; static struct proto pppoe_sk_proto __read_mostly = { .name = "PPPOE", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; /*********************************************************************** * * Initialize a new struct sock. * **********************************************************************/ static int pppoe_create(struct net *net, struct socket *sock, int kern) { struct sock *sk; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = &pppoe_ops; sk->sk_backlog_rcv = pppoe_rcv_core; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; sk->sk_protocol = PX_PROTO_OE; INIT_WORK(&pppox_sk(sk)->proto.pppoe.padt_work, pppoe_unbind_sock_work); return 0; } static int pppoe_release(struct socket *sock) { struct sock *sk = sock->sk; struct pppox_sock *po; struct pppoe_net *pn; struct net *net = NULL; if (!sk) return 0; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { release_sock(sk); return -EBADF; } po = pppox_sk(sk); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } pppox_unbind_sock(sk); /* Signal the death of the socket. */ sk->sk_state = PPPOX_DEAD; net = sock_net(sk); pn = pppoe_pernet(net); /* * protect "po" from concurrent updates * on pppoe_flush_dev */ delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); sock_orphan(sk); sock->sk = NULL; skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr; struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = NULL; struct pppoe_net *pn; struct net *net = NULL; int error; lock_sock(sk); error = -EINVAL; if (sockaddr_len != sizeof(struct sockaddr_pppox)) goto end; if (sp->sa_protocol != PX_PROTO_OE) goto end; /* Check for already bound sockets */ error = -EBUSY; if ((sk->sk_state & PPPOX_CONNECTED) && stage_session(sp->sa_addr.pppoe.sid)) goto end; /* Check for already disconnected sockets, on attempts to disconnect */ error = -EALREADY; if ((sk->sk_state & PPPOX_DEAD) && !stage_session(sp->sa_addr.pppoe.sid)) goto end; error = 0; /* Delete the old binding */ if (stage_session(po->pppoe_pa.sid)) { pppox_unbind_sock(sk); pn = pppoe_pernet(sock_net(sk)); delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } po->pppoe_ifindex = 0; memset(&po->pppoe_pa, 0, sizeof(po->pppoe_pa)); memset(&po->pppoe_relay, 0, sizeof(po->pppoe_relay)); memset(&po->chan, 0, sizeof(po->chan)); po->next = NULL; po->num = 0; sk->sk_state = PPPOX_NONE; } /* Re-bind in session stage only */ if (stage_session(sp->sa_addr.pppoe.sid)) { error = -ENODEV; net = sock_net(sk); dev = dev_get_by_name(net, sp->sa_addr.pppoe.dev); if (!dev) goto err_put; po->pppoe_dev = dev; po->pppoe_ifindex = dev->ifindex; pn = pppoe_pernet(net); if (!(dev->flags & IFF_UP)) { goto err_put; } memcpy(&po->pppoe_pa, &sp->sa_addr.pppoe, sizeof(struct pppoe_addr)); write_lock_bh(&pn->hash_lock); error = __set_item(pn, po); write_unlock_bh(&pn->hash_lock); if (error < 0) goto err_put; po->chan.hdrlen = (sizeof(struct pppoe_hdr) + dev->hard_header_len); po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr) - 2; po->chan.private = sk; po->chan.ops = &pppoe_chan_ops; error = ppp_register_net_channel(dev_net(dev), &po->chan); if (error) { delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); goto err_put; } sk->sk_state = PPPOX_CONNECTED; } po->num = sp->sa_addr.pppoe.sid; end: release_sock(sk); return error; err_put: if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } goto end; } static int pppoe_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OE; memcpy(&sp.sa_addr.pppoe, &pppox_sk(sock->sk)->pppoe_pa, sizeof(struct pppoe_addr)); memcpy(uaddr, &sp, len); return len; } static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int val; int err; switch (cmd) { case PPPIOCGMRU: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) break; err = -EFAULT; if (put_user(po->pppoe_dev->mtu - sizeof(struct pppoe_hdr) - PPP_HDRLEN, (int __user *)arg)) break; err = 0; break; case PPPIOCSMRU: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) break; err = -EFAULT; if (get_user(val, (int __user *)arg)) break; if (val < (po->pppoe_dev->mtu - sizeof(struct pppoe_hdr) - PPP_HDRLEN)) err = 0; else err = -EINVAL; break; case PPPIOCSFLAGS: err = -EFAULT; if (get_user(val, (int __user *)arg)) break; err = 0; break; case PPPOEIOCSFWD: { struct pppox_sock *relay_po; err = -EBUSY; if (sk->sk_state & (PPPOX_BOUND | PPPOX_DEAD)) break; err = -ENOTCONN; if (!(sk->sk_state & PPPOX_CONNECTED)) break; /* PPPoE address from the user specifies an outbound PPPoE address which frames are forwarded to */ err = -EFAULT; if (copy_from_user(&po->pppoe_relay, (void __user *)arg, sizeof(struct sockaddr_pppox))) break; err = -EINVAL; if (po->pppoe_relay.sa_family != AF_PPPOX || po->pppoe_relay.sa_protocol != PX_PROTO_OE) break; /* Check that the socket referenced by the address actually exists. */ relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay); if (!relay_po) break; sock_put(sk_pppox(relay_po)); sk->sk_state |= PPPOX_RELAY; err = 0; break; } case PPPOEIOCDFWD: err = -EALREADY; if (!(sk->sk_state & PPPOX_RELAY)) break; sk->sk_state &= ~PPPOX_RELAY; err = 0; break; default: err = -ENOTTY; } return err; } static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sk_buff *skb; struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int error; struct pppoe_hdr hdr; struct pppoe_hdr *ph; struct net_device *dev; char *start; int hlen; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) { error = -ENOTCONN; goto end; } hdr.ver = 1; hdr.type = 1; hdr.code = 0; hdr.sid = po->num; dev = po->pppoe_dev; error = -EMSGSIZE; if (total_len > (dev->mtu + dev->hard_header_len)) goto end; hlen = LL_RESERVED_SPACE(dev); skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len + dev->needed_tailroom, 0, GFP_KERNEL); if (!skb) { error = -ENOMEM; goto end; } /* Reserve space for headers. */ skb_reserve(skb, hlen); skb_reset_network_header(skb); skb->dev = dev; skb->priority = sk->sk_priority; skb->protocol = cpu_to_be16(ETH_P_PPP_SES); ph = skb_put(skb, total_len + sizeof(struct pppoe_hdr)); start = (char *)&ph->tag[0]; error = memcpy_from_msg(start, m, total_len); if (error < 0) { kfree_skb(skb); goto end; } error = total_len; dev_hard_header(skb, dev, ETH_P_PPP_SES, po->pppoe_pa.remote, NULL, total_len); memcpy(ph, &hdr, sizeof(struct pppoe_hdr)); ph->length = htons(total_len); dev_queue_xmit(skb); end: release_sock(sk); return error; } /************************************************************************ * * xmit function for internal use. * ***********************************************************************/ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = po->pppoe_dev; struct pppoe_hdr *ph; int data_len = skb->len; /* The higher-level PPP code (ppp_unregister_channel()) ensures the PPP * xmit operations conclude prior to an unregistration call. Thus * sk->sk_state cannot change, so we don't need to do lock_sock(). * But, we also can't do a lock_sock since that introduces a potential * deadlock as we'd reverse the lock ordering used when calling * ppp_unregister_channel(). */ if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto abort; if (!dev) goto abort; /* Copy the data if there is no space for the header or if it's * read-only. */ if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph))) goto abort; __skb_push(skb, sizeof(*ph)); skb_reset_network_header(skb); ph = pppoe_hdr(skb); ph->ver = 1; ph->type = 1; ph->code = 0; ph->sid = po->num; ph->length = htons(data_len); skb->protocol = cpu_to_be16(ETH_P_PPP_SES); skb->dev = dev; dev_hard_header(skb, dev, ETH_P_PPP_SES, po->pppoe_pa.remote, NULL, data_len); dev_queue_xmit(skb); return 1; abort: kfree_skb(skb); return 1; } /************************************************************************ * * xmit function called by generic PPP driver * sends PPP frame over PPPoE socket * ***********************************************************************/ static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = (struct sock *)chan->private; return __pppoe_xmit(sk, skb); } static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path, const struct ppp_channel *chan) { struct sock *sk = (struct sock *)chan->private; struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = po->pppoe_dev; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED) || !dev) return -1; path->type = DEV_PATH_PPPOE; path->encap.proto = htons(ETH_P_PPP_SES); path->encap.id = be16_to_cpu(po->num); memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN); memcpy(ctx->daddr, po->pppoe_pa.remote, ETH_ALEN); path->dev = ctx->dev; ctx->dev = dev; return 0; } static const struct ppp_channel_ops pppoe_chan_ops = { .start_xmit = pppoe_xmit, .fill_forward_path = pppoe_fill_forward_path, }; static int pppoe_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; if (sk->sk_state & PPPOX_BOUND) { error = -EIO; goto end; } skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &error); if (error < 0) goto end; if (skb) { total_len = min_t(size_t, total_len, skb->len); error = skb_copy_datagram_msg(skb, 0, m, total_len); if (error == 0) { consume_skb(skb); return total_len; } } kfree_skb(skb); end: return error; } #ifdef CONFIG_PROC_FS static int pppoe_seq_show(struct seq_file *seq, void *v) { struct pppox_sock *po; char *dev_name; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Id Address Device\n"); goto out; } po = v; dev_name = po->pppoe_pa.dev; seq_printf(seq, "%08X %pM %8s\n", po->pppoe_pa.sid, po->pppoe_pa.remote, dev_name); out: return 0; } static inline struct pppox_sock *pppoe_get_idx(struct pppoe_net *pn, loff_t pos) { struct pppox_sock *po; int i; for (i = 0; i < PPPOE_HASH_SIZE; i++) { po = pn->hash_table[i]; while (po) { if (!pos--) goto out; po = po->next; } } out: return po; } static void *pppoe_seq_start(struct seq_file *seq, loff_t *pos) __acquires(pn->hash_lock) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); loff_t l = *pos; read_lock_bh(&pn->hash_lock); return l ? pppoe_get_idx(pn, --l) : SEQ_START_TOKEN; } static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); struct pppox_sock *po; ++*pos; if (v == SEQ_START_TOKEN) { po = pppoe_get_idx(pn, 0); goto out; } po = v; if (po->next) po = po->next; else { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); po = NULL; while (++hash < PPPOE_HASH_SIZE) { po = pn->hash_table[hash]; if (po) break; } } out: return po; } static void pppoe_seq_stop(struct seq_file *seq, void *v) __releases(pn->hash_lock) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); read_unlock_bh(&pn->hash_lock); } static const struct seq_operations pppoe_seq_ops = { .start = pppoe_seq_start, .next = pppoe_seq_next, .stop = pppoe_seq_stop, .show = pppoe_seq_show, }; #endif /* CONFIG_PROC_FS */ static const struct proto_ops pppoe_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppoe_release, .bind = sock_no_bind, .connect = pppoe_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppoe_getname, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = pppoe_sendmsg, .recvmsg = pppoe_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppoe_proto = { .create = pppoe_create, .ioctl = pppoe_ioctl, .owner = THIS_MODULE, }; static __net_init int pppoe_init_net(struct net *net) { struct pppoe_net *pn = pppoe_pernet(net); struct proc_dir_entry *pde; rwlock_init(&pn->hash_lock); pde = proc_create_net("pppoe", 0444, net->proc_net, &pppoe_seq_ops, sizeof(struct seq_net_private)); #ifdef CONFIG_PROC_FS if (!pde) return -ENOMEM; #endif return 0; } static __net_exit void pppoe_exit_net(struct net *net) { remove_proc_entry("pppoe", net->proc_net); } static struct pernet_operations pppoe_net_ops = { .init = pppoe_init_net, .exit = pppoe_exit_net, .id = &pppoe_net_id, .size = sizeof(struct pppoe_net), }; static int __init pppoe_init(void) { int err; err = register_pernet_device(&pppoe_net_ops); if (err) goto out; err = proto_register(&pppoe_sk_proto, 0); if (err) goto out_unregister_net_ops; err = register_pppox_proto(PX_PROTO_OE, &pppoe_proto); if (err) goto out_unregister_pppoe_proto; dev_add_pack(&pppoes_ptype); dev_add_pack(&pppoed_ptype); register_netdevice_notifier(&pppoe_notifier); return 0; out_unregister_pppoe_proto: proto_unregister(&pppoe_sk_proto); out_unregister_net_ops: unregister_pernet_device(&pppoe_net_ops); out: return err; } static void __exit pppoe_exit(void) { unregister_netdevice_notifier(&pppoe_notifier); dev_remove_pack(&pppoed_ptype); dev_remove_pack(&pppoes_ptype); unregister_pppox_proto(PX_PROTO_OE); proto_unregister(&pppoe_sk_proto); unregister_pernet_device(&pppoe_net_ops); } module_init(pppoe_init); module_exit(pppoe_exit); MODULE_AUTHOR("Michal Ostrowski <mostrows@speakeasy.net>"); MODULE_DESCRIPTION("PPP over Ethernet driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OE); |
484 484 107 108 108 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 | // SPDX-License-Identifier: GPL-2.0 /* * property.c - Unified device property interface. * * Copyright (C) 2014, Intel Corporation * Authors: Rafael J. Wysocki <rafael.j.wysocki@intel.com> * Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/acpi.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_graph.h> #include <linux/of_irq.h> #include <linux/property.h> #include <linux/etherdevice.h> #include <linux/phy.h> struct fwnode_handle *__dev_fwnode(struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode); const struct fwnode_handle *__dev_fwnode_const(const struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode_const); /** * device_property_present - check if a property of a device is present * @dev: Device whose property is being checked * @propname: Name of the property * * Check if property @propname is present in the device firmware description. */ bool device_property_present(struct device *dev, const char *propname) { return fwnode_property_present(dev_fwnode(dev), propname); } EXPORT_SYMBOL_GPL(device_property_present); /** * fwnode_property_present - check if a property of a firmware node is present * @fwnode: Firmware node whose property to check * @propname: Name of the property */ bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { bool ret; if (IS_ERR_OR_NULL(fwnode)) return false; ret = fwnode_call_bool_op(fwnode, property_present, propname); if (ret) return ret; return fwnode_call_bool_op(fwnode->secondary, property_present, propname); } EXPORT_SYMBOL_GPL(fwnode_property_present); /** * device_property_read_u8_array - return a u8 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u8 properties with @propname from the device * firmware description and stores them to @val if found. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u8_array(struct device *dev, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_u8_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u8_array); /** * device_property_read_u16_array - return a u16 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u16 properties with @propname from the device * firmware description and stores them to @val if found. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u16_array(struct device *dev, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_u16_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u16_array); /** * device_property_read_u32_array - return a u32 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u32 properties with @propname from the device * firmware description and stores them to @val if found. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u32_array(struct device *dev, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_u32_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u32_array); /** * device_property_read_u64_array - return a u64 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u64 properties with @propname from the device * firmware description and stores them to @val if found. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u64_array(struct device *dev, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_u64_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u64_array); /** * device_property_read_string_array - return a string array property of device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of string properties with @propname from the device * firmware description and stores them to @val if found. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string_array(struct device *dev, const char *propname, const char **val, size_t nval) { return fwnode_property_read_string_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_string_array); /** * device_property_read_string - return a string property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The value is stored here * * Function reads property @propname from the device firmware description and * stores the value into @val if found. The value is checked to be a string. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property type is not a string. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string(struct device *dev, const char *propname, const char **val) { return fwnode_property_read_string(dev_fwnode(dev), propname, val); } EXPORT_SYMBOL_GPL(device_property_read_string); /** * device_property_match_string - find a string in an array and return index * @dev: Device to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int device_property_match_string(struct device *dev, const char *propname, const char *string) { return fwnode_property_match_string(dev_fwnode(dev), propname, string); } EXPORT_SYMBOL_GPL(device_property_match_string); static int fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_int_array, propname, elem_size, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_int_array, propname, elem_size, val, nval); } /** * fwnode_property_read_u8_array - return a u8 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u8 properties with @propname from @fwnode and stores them to * @val if found. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u8), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u8_array); /** * fwnode_property_read_u16_array - return a u16 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u16 properties with @propname from @fwnode and store them to * @val if found. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u16), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u16_array); /** * fwnode_property_read_u32_array - return a u32 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u32 properties with @propname from @fwnode store them to * @val if found. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u32), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u32_array); /** * fwnode_property_read_u64_array - return a u64 array property firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u64 properties with @propname from @fwnode and store them to * @val if found. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u64), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u64_array); /** * fwnode_property_read_string_array - return string array property of a node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an string list property @propname from the given firmware node and store * them to @val if found. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_string_array, propname, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_string_array, propname, val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_string_array); /** * fwnode_property_read_string - return a string property of a firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The value is stored here * * Read property @propname from the given firmware node and store the value into * @val if found. The value is checked to be a string. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val) { int ret = fwnode_property_read_string_array(fwnode, propname, val, 1); return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(fwnode_property_read_string); /** * fwnode_property_match_string - find a string in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string) { const char **values; int nval, ret; nval = fwnode_property_read_string_array(fwnode, propname, NULL, 0); if (nval < 0) return nval; if (nval == 0) return -ENODATA; values = kcalloc(nval, sizeof(*values), GFP_KERNEL); if (!values) return -ENOMEM; ret = fwnode_property_read_string_array(fwnode, propname, values, nval); if (ret < 0) goto out; ret = match_string(values, nval, string); if (ret < 0) ret = -ENODATA; out: kfree(values); return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_string); /** * fwnode_property_get_reference_args() - Find a reference with arguments * @fwnode: Firmware node where to look for the reference * @prop: The name of the property * @nargs_prop: The name of the property telling the number of * arguments in the referred node. NULL if @nargs is known, * otherwise @nargs is ignored. Only relevant on OF. * @nargs: Number of arguments. Ignored if @nargs_prop is non-NULL. * @index: Index of the reference, from zero onwards. * @args: Result structure with reference and integer arguments. * * Obtain a reference based on a named property in an fwnode, with * integer arguments. * * Caller is responsible to call fwnode_handle_put() on the returned * args->fwnode pointer. * * Returns: %0 on success * %-ENOENT when the index is out of bounds, the index has an empty * reference or the property was not found * %-EINVAL on parse error */ int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -ENOENT; ret = fwnode_call_int_op(fwnode, get_reference_args, prop, nargs_prop, nargs, index, args); if (ret == 0) return ret; if (IS_ERR_OR_NULL(fwnode->secondary)) return ret; return fwnode_call_int_op(fwnode->secondary, get_reference_args, prop, nargs_prop, nargs, index, args); } EXPORT_SYMBOL_GPL(fwnode_property_get_reference_args); /** * fwnode_find_reference - Find named reference to a fwnode_handle * @fwnode: Firmware node where to look for the reference * @name: The name of the reference * @index: Index of the reference * * @index can be used when the named reference holds a table of references. * * Returns pointer to the reference fwnode, or ERR_PTR. Caller is responsible to * call fwnode_handle_put() on the returned fwnode pointer. */ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *name, unsigned int index) { struct fwnode_reference_args args; int ret; ret = fwnode_property_get_reference_args(fwnode, name, NULL, 0, index, &args); return ret ? ERR_PTR(ret) : args.fwnode; } EXPORT_SYMBOL_GPL(fwnode_find_reference); /** * device_remove_properties - Remove properties from a device object. * @dev: Device whose properties to remove. * * The function removes properties previously associated to the device * firmware node with device_add_properties(). Memory allocated to the * properties will also be released. */ void device_remove_properties(struct device *dev) { struct fwnode_handle *fwnode = dev_fwnode(dev); if (!fwnode) return; if (is_software_node(fwnode->secondary)) { fwnode_remove_software_node(fwnode->secondary); set_secondary_fwnode(dev, NULL); } } EXPORT_SYMBOL_GPL(device_remove_properties); /** * device_add_properties - Add a collection of properties to a device object. * @dev: Device to add properties to. * @properties: Collection of properties to add. * * Associate a collection of device properties represented by @properties with * @dev. The function takes a copy of @properties. * * WARNING: The callers should not use this function if it is known that there * is no real firmware node associated with @dev! In that case the callers * should create a software node and assign it to @dev directly. */ int device_add_properties(struct device *dev, const struct property_entry *properties) { struct fwnode_handle *fwnode; fwnode = fwnode_create_software_node(properties, NULL); if (IS_ERR(fwnode)) return PTR_ERR(fwnode); set_secondary_fwnode(dev, fwnode); return 0; } EXPORT_SYMBOL_GPL(device_add_properties); /** * fwnode_get_name - Return the name of a node * @fwnode: The firmware node * * Returns a pointer to the node name. */ const char *fwnode_get_name(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name); } EXPORT_SYMBOL_GPL(fwnode_get_name); /** * fwnode_get_name_prefix - Return the prefix of node for printing purposes * @fwnode: The firmware node * * Returns the prefix of a node, intended to be printed right before the node. * The prefix works also as a separator between the nodes. */ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name_prefix); } /** * fwnode_get_parent - Return parent firwmare node * @fwnode: Firmware whose parent is retrieved * * Return parent firmware node of the given node if possible or %NULL if no * parent was available. */ struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_parent); } EXPORT_SYMBOL_GPL(fwnode_get_parent); /** * fwnode_get_next_parent - Iterate to the node's parent * @fwnode: Firmware whose parent is retrieved * * This is like fwnode_get_parent() except that it drops the refcount * on the passed node, making it suitable for iterating through a * node's parents. * * Returns a node pointer with refcount incremented, use * fwnode_handle_node() on it when done. */ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) { struct fwnode_handle *parent = fwnode_get_parent(fwnode); fwnode_handle_put(fwnode); return parent; } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); /** * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode * @fwnode: firmware node * * Given a firmware node (@fwnode), this function finds its closest ancestor * firmware node that has a corresponding struct device and returns that struct * device. * * The caller of this function is expected to call put_device() on the returned * device when they are done. */ struct device *fwnode_get_next_parent_dev(struct fwnode_handle *fwnode) { struct device *dev; fwnode_handle_get(fwnode); do { fwnode = fwnode_get_next_parent(fwnode); if (!fwnode) return NULL; dev = get_dev_from_fwnode(fwnode); } while (!dev); fwnode_handle_put(fwnode); return dev; } /** * fwnode_count_parents - Return the number of parents a node has * @fwnode: The node the parents of which are to be counted * * Returns the number of parents a node has. */ unsigned int fwnode_count_parents(const struct fwnode_handle *fwnode) { struct fwnode_handle *__fwnode; unsigned int count; __fwnode = fwnode_get_parent(fwnode); for (count = 0; __fwnode; count++) __fwnode = fwnode_get_next_parent(__fwnode); return count; } EXPORT_SYMBOL_GPL(fwnode_count_parents); /** * fwnode_get_nth_parent - Return an nth parent of a node * @fwnode: The node the parent of which is requested * @depth: Distance of the parent from the node * * Returns the nth parent of a node. If there is no parent at the requested * @depth, %NULL is returned. If @depth is 0, the functionality is equivalent to * fwnode_handle_get(). For @depth == 1, it is fwnode_get_parent() and so on. * * The caller is responsible for calling fwnode_handle_put() for the returned * node. */ struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, unsigned int depth) { fwnode_handle_get(fwnode); do { if (depth-- == 0) break; fwnode = fwnode_get_next_parent(fwnode); } while (fwnode); return fwnode; } EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); /** * fwnode_is_ancestor_of - Test if @test_ancestor is ancestor of @test_child * @test_ancestor: Firmware which is tested for being an ancestor * @test_child: Firmware which is tested for being the child * * A node is considered an ancestor of itself too. * * Returns true if @test_ancestor is an ancestor of @test_child. * Otherwise, returns false. */ bool fwnode_is_ancestor_of(struct fwnode_handle *test_ancestor, struct fwnode_handle *test_child) { if (IS_ERR_OR_NULL(test_ancestor)) return false; fwnode_handle_get(test_child); do { if (test_child == test_ancestor) { fwnode_handle_put(test_child); return true; } test_child = fwnode_get_next_parent(test_child); } while (test_child); return false; } /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. */ struct fwnode_handle * fwnode_get_next_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { return fwnode_call_ptr_op(fwnode, get_next_child_node, child); } EXPORT_SYMBOL_GPL(fwnode_get_next_child_node); /** * fwnode_get_next_available_child_node - Return the next * available child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. */ struct fwnode_handle * fwnode_get_next_available_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct fwnode_handle *next_child = child; if (IS_ERR_OR_NULL(fwnode)) return NULL; do { next_child = fwnode_get_next_child_node(fwnode, next_child); if (!next_child) return NULL; } while (!fwnode_device_is_available(next_child)); return next_child; } EXPORT_SYMBOL_GPL(fwnode_get_next_available_child_node); /** * device_get_next_child_node - Return the next child node handle for a device * @dev: Device to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle. */ struct fwnode_handle *device_get_next_child_node(struct device *dev, struct fwnode_handle *child) { const struct fwnode_handle *fwnode = dev_fwnode(dev); struct fwnode_handle *next; if (IS_ERR_OR_NULL(fwnode)) return NULL; /* Try to find a child in primary fwnode */ next = fwnode_get_next_child_node(fwnode, child); if (next) return next; /* When no more children in primary, continue with secondary */ return fwnode_get_next_child_node(fwnode->secondary, child); } EXPORT_SYMBOL_GPL(device_get_next_child_node); /** * fwnode_get_named_child_node - Return first matching named child node handle * @fwnode: Firmware node to find the named child node for. * @childname: String to match child node name against. */ struct fwnode_handle * fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { return fwnode_call_ptr_op(fwnode, get_named_child_node, childname); } EXPORT_SYMBOL_GPL(fwnode_get_named_child_node); /** * device_get_named_child_node - Return first matching named child node handle * @dev: Device to find the named child node for. * @childname: String to match child node name against. */ struct fwnode_handle *device_get_named_child_node(struct device *dev, const char *childname) { return fwnode_get_named_child_node(dev_fwnode(dev), childname); } EXPORT_SYMBOL_GPL(device_get_named_child_node); /** * fwnode_handle_get - Obtain a reference to a device node * @fwnode: Pointer to the device node to obtain the reference to. * * Returns the fwnode handle. */ struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode) { if (!fwnode_has_op(fwnode, get)) return fwnode; return fwnode_call_ptr_op(fwnode, get); } EXPORT_SYMBOL_GPL(fwnode_handle_get); /** * fwnode_handle_put - Drop reference to a device node * @fwnode: Pointer to the device node to drop the reference to. * * This has to be used when terminating device_for_each_child_node() iteration * with break or return to prevent stale device node references from being left * behind. */ void fwnode_handle_put(struct fwnode_handle *fwnode) { fwnode_call_void_op(fwnode, put); } EXPORT_SYMBOL_GPL(fwnode_handle_put); /** * fwnode_device_is_available - check if a device is available for use * @fwnode: Pointer to the fwnode of the device. * * For fwnode node types that don't implement the .device_is_available() * operation, this function returns true. */ bool fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (IS_ERR_OR_NULL(fwnode)) return false; if (!fwnode_has_op(fwnode, device_is_available)) return true; return fwnode_call_bool_op(fwnode, device_is_available); } EXPORT_SYMBOL_GPL(fwnode_device_is_available); /** * device_get_child_node_count - return the number of child nodes for device * @dev: Device to cound the child nodes for */ unsigned int device_get_child_node_count(struct device *dev) { struct fwnode_handle *child; unsigned int count = 0; device_for_each_child_node(dev, child) count++; return count; } EXPORT_SYMBOL_GPL(device_get_child_node_count); bool device_dma_supported(struct device *dev) { const struct fwnode_handle *fwnode = dev_fwnode(dev); /* For DT, this is always supported. * For ACPI, this depends on CCA, which * is determined by the acpi_dma_supported(). */ if (is_of_node(fwnode)) return true; return acpi_dma_supported(to_acpi_device_node(fwnode)); } EXPORT_SYMBOL_GPL(device_dma_supported); enum dev_dma_attr device_get_dma_attr(struct device *dev) { const struct fwnode_handle *fwnode = dev_fwnode(dev); enum dev_dma_attr attr = DEV_DMA_NOT_SUPPORTED; if (is_of_node(fwnode)) { if (of_dma_is_coherent(to_of_node(fwnode))) attr = DEV_DMA_COHERENT; else attr = DEV_DMA_NON_COHERENT; } else attr = acpi_get_dma_attr(to_acpi_device_node(fwnode)); return attr; } EXPORT_SYMBOL_GPL(device_get_dma_attr); /** * fwnode_get_phy_mode - Get phy mode for given firmware node * @fwnode: Pointer to the given node * * The function gets phy interface string from property 'phy-mode' or * 'phy-connection-type', and return its index in phy_modes table, or errno in * error case. */ int fwnode_get_phy_mode(struct fwnode_handle *fwnode) { const char *pm; int err, i; err = fwnode_property_read_string(fwnode, "phy-mode", &pm); if (err < 0) err = fwnode_property_read_string(fwnode, "phy-connection-type", &pm); if (err < 0) return err; for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) if (!strcasecmp(pm, phy_modes(i))) return i; return -ENODEV; } EXPORT_SYMBOL_GPL(fwnode_get_phy_mode); /** * device_get_phy_mode - Get phy mode for given device * @dev: Pointer to the given device * * The function gets phy interface string from property 'phy-mode' or * 'phy-connection-type', and return its index in phy_modes table, or errno in * error case. */ int device_get_phy_mode(struct device *dev) { return fwnode_get_phy_mode(dev_fwnode(dev)); } EXPORT_SYMBOL_GPL(device_get_phy_mode); static void *fwnode_get_mac_addr(struct fwnode_handle *fwnode, const char *name, char *addr, int alen) { int ret = fwnode_property_read_u8_array(fwnode, name, addr, alen); if (ret == 0 && alen == ETH_ALEN && is_valid_ether_addr(addr)) return addr; return NULL; } /** * fwnode_get_mac_address - Get the MAC from the firmware node * @fwnode: Pointer to the firmware node * @addr: Address of buffer to store the MAC in * @alen: Length of the buffer pointed to by addr, should be ETH_ALEN * * Search the firmware node for the best MAC address to use. 'mac-address' is * checked first, because that is supposed to contain to "most recent" MAC * address. If that isn't set, then 'local-mac-address' is checked next, * because that is the default address. If that isn't set, then the obsolete * 'address' is checked, just in case we're using an old device tree. * * Note that the 'address' property is supposed to contain a virtual address of * the register set, but some DTS files have redefined that property to be the * MAC address. * * All-zero MAC addresses are rejected, because those could be properties that * exist in the firmware tables, but were not updated by the firmware. For * example, the DTS could define 'mac-address' and 'local-mac-address', with * zero MAC addresses. Some older U-Boots only initialized 'local-mac-address'. * In this case, the real MAC is in 'local-mac-address', and 'mac-address' * exists but is all zeros. */ void *fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr, int alen) { char *res; res = fwnode_get_mac_addr(fwnode, "mac-address", addr, alen); if (res) return res; res = fwnode_get_mac_addr(fwnode, "local-mac-address", addr, alen); if (res) return res; return fwnode_get_mac_addr(fwnode, "address", addr, alen); } EXPORT_SYMBOL(fwnode_get_mac_address); /** * device_get_mac_address - Get the MAC for a given device * @dev: Pointer to the device * @addr: Address of buffer to store the MAC in * @alen: Length of the buffer pointed to by addr, should be ETH_ALEN */ void *device_get_mac_address(struct device *dev, char *addr, int alen) { return fwnode_get_mac_address(dev_fwnode(dev), addr, alen); } EXPORT_SYMBOL(device_get_mac_address); /** * fwnode_irq_get - Get IRQ directly from a fwnode * @fwnode: Pointer to the firmware node * @index: Zero-based index of the IRQ * * Returns Linux IRQ number on success. Other values are determined * accordingly to acpi_/of_ irq_get() operation. */ int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { struct resource res; int ret; if (is_of_node(fwnode)) return of_irq_get(to_of_node(fwnode), index); ret = acpi_irq_get(ACPI_HANDLE_FWNODE(fwnode), index, &res); if (ret) return ret; return res.start; } EXPORT_SYMBOL(fwnode_irq_get); /** * fwnode_iomap - Maps the memory mapped IO for a given fwnode * @fwnode: Pointer to the firmware node * @index: Index of the IO range * * Returns a pointer to the mapped memory. */ void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index) { if (IS_ENABLED(CONFIG_OF_ADDRESS) && is_of_node(fwnode)) return of_iomap(to_of_node(fwnode), index); return NULL; } EXPORT_SYMBOL(fwnode_iomap); /** * fwnode_irq_get_byname - Get IRQ from a fwnode using its name * @fwnode: Pointer to the firmware node * @name: IRQ name * * Description: * Find a match to the string @name in the 'interrupt-names' string array * in _DSD for ACPI, or of_node for Device Tree. Then get the Linux IRQ * number of the IRQ resource corresponding to the index of the matched * string. * * Return: * Linux IRQ number on success, or negative errno otherwise. */ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name) { int index; if (!name) return -EINVAL; index = fwnode_property_match_string(fwnode, "interrupt-names", name); if (index < 0) return index; return fwnode_irq_get(fwnode, index); } EXPORT_SYMBOL(fwnode_irq_get_byname); /** * fwnode_graph_get_next_endpoint - Get next endpoint firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * Returns an endpoint firmware node pointer or %NULL if no more endpoints * are available. */ struct fwnode_handle * fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *ep, *port_parent = NULL; const struct fwnode_handle *parent; /* * If this function is in a loop and the previous iteration returned * an endpoint from fwnode->secondary, then we need to use the secondary * as parent rather than @fwnode. */ if (prev) { port_parent = fwnode_graph_get_port_parent(prev); parent = port_parent; } else { parent = fwnode; } if (IS_ERR_OR_NULL(parent)) return NULL; ep = fwnode_call_ptr_op(parent, graph_get_next_endpoint, prev); if (ep) goto out_put_port_parent; ep = fwnode_graph_get_next_endpoint(parent->secondary, NULL); out_put_port_parent: fwnode_handle_put(port_parent); return ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint); /** * fwnode_graph_get_port_parent - Return the device fwnode of a port endpoint * @endpoint: Endpoint firmware node of the port * * Return: the firmware node of the device the @endpoint belongs to. */ struct fwnode_handle * fwnode_graph_get_port_parent(const struct fwnode_handle *endpoint) { struct fwnode_handle *port, *parent; port = fwnode_get_parent(endpoint); parent = fwnode_call_ptr_op(port, graph_get_port_parent); fwnode_handle_put(port); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_port_parent); /** * fwnode_graph_get_remote_port_parent - Return fwnode of a remote device * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote device the @fwnode points to. */ struct fwnode_handle * fwnode_graph_get_remote_port_parent(const struct fwnode_handle *fwnode) { struct fwnode_handle *endpoint, *parent; endpoint = fwnode_graph_get_remote_endpoint(fwnode); parent = fwnode_graph_get_port_parent(endpoint); fwnode_handle_put(endpoint); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent); /** * fwnode_graph_get_remote_port - Return fwnode of a remote port * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote port the @fwnode points to. */ struct fwnode_handle * fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode) { return fwnode_get_next_parent(fwnode_graph_get_remote_endpoint(fwnode)); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); /** * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote endpoint the @fwnode points to. */ struct fwnode_handle * fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, graph_get_remote_endpoint); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); /** * fwnode_graph_get_remote_node - get remote parent node for given port/endpoint * @fwnode: pointer to parent fwnode_handle containing graph port/endpoint * @port_id: identifier of the parent port node * @endpoint_id: identifier of the endpoint node * * Return: Remote fwnode handle associated with remote endpoint node linked * to @node. Use fwnode_node_put() on it when done. */ struct fwnode_handle * fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port_id, u32 endpoint_id) { struct fwnode_handle *endpoint = NULL; while ((endpoint = fwnode_graph_get_next_endpoint(fwnode, endpoint))) { struct fwnode_endpoint fwnode_ep; struct fwnode_handle *remote; int ret; ret = fwnode_graph_parse_endpoint(endpoint, &fwnode_ep); if (ret < 0) continue; if (fwnode_ep.port != port_id || fwnode_ep.id != endpoint_id) continue; remote = fwnode_graph_get_remote_port_parent(endpoint); if (!remote) return NULL; return fwnode_device_is_available(remote) ? remote : NULL; } return NULL; } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_node); /** * fwnode_graph_get_endpoint_by_id - get endpoint by port and endpoint numbers * @fwnode: parent fwnode_handle containing the graph * @port: identifier of the port node * @endpoint: identifier of the endpoint node under the port node * @flags: fwnode lookup flags * * Return the fwnode handle of the local endpoint corresponding the port and * endpoint IDs or NULL if not found. * * If FWNODE_GRAPH_ENDPOINT_NEXT is passed in @flags and the specified endpoint * has not been found, look for the closest endpoint ID greater than the * specified one and return the endpoint that corresponds to it, if present. * * Do not return endpoints that belong to disabled devices, unless * FWNODE_GRAPH_DEVICE_DISABLED is passed in @flags. * * The returned endpoint needs to be released by calling fwnode_handle_put() on * it when it is not needed any more. */ struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags) { struct fwnode_handle *ep = NULL, *best_ep = NULL; unsigned int best_ep_id = 0; bool endpoint_next = flags & FWNODE_GRAPH_ENDPOINT_NEXT; bool enabled_only = !(flags & FWNODE_GRAPH_DEVICE_DISABLED); while ((ep = fwnode_graph_get_next_endpoint(fwnode, ep))) { struct fwnode_endpoint fwnode_ep = { 0 }; int ret; if (enabled_only) { struct fwnode_handle *dev_node; bool available; dev_node = fwnode_graph_get_remote_port_parent(ep); available = fwnode_device_is_available(dev_node); fwnode_handle_put(dev_node); if (!available) continue; } ret = fwnode_graph_parse_endpoint(ep, &fwnode_ep); if (ret < 0) continue; if (fwnode_ep.port != port) continue; if (fwnode_ep.id == endpoint) return ep; if (!endpoint_next) continue; /* * If the endpoint that has just been found is not the first * matching one and the ID of the one found previously is closer * to the requested endpoint ID, skip it. */ if (fwnode_ep.id < endpoint || (best_ep && best_ep_id < fwnode_ep.id)) continue; fwnode_handle_put(best_ep); best_ep = fwnode_handle_get(ep); best_ep_id = fwnode_ep.id; } return best_ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_by_id); /** * fwnode_graph_parse_endpoint - parse common endpoint node properties * @fwnode: pointer to endpoint fwnode_handle * @endpoint: pointer to the fwnode endpoint data structure * * Parse @fwnode representing a graph endpoint node and store the * information in @endpoint. The caller must hold a reference to * @fwnode. */ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { memset(endpoint, 0, sizeof(*endpoint)); return fwnode_call_int_op(fwnode, graph_parse_endpoint, endpoint); } EXPORT_SYMBOL(fwnode_graph_parse_endpoint); const void *device_get_match_data(const struct device *dev) { return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev); } EXPORT_SYMBOL_GPL(device_get_match_data); static void * fwnode_graph_devcon_match(struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match) { struct fwnode_handle *node; struct fwnode_handle *ep; void *ret; fwnode_graph_for_each_endpoint(fwnode, ep) { node = fwnode_graph_get_remote_port_parent(ep); if (!fwnode_device_is_available(node)) { fwnode_handle_put(node); continue; } ret = match(node, con_id, data); fwnode_handle_put(node); if (ret) { fwnode_handle_put(ep); return ret; } } return NULL; } static void * fwnode_devcon_match(struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match) { struct fwnode_handle *node; void *ret; int i; for (i = 0; ; i++) { node = fwnode_find_reference(fwnode, con_id, i); if (IS_ERR(node)) break; ret = match(node, NULL, data); fwnode_handle_put(node); if (ret) return ret; } return NULL; } /** * fwnode_connection_find_match - Find connection from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * * Find a connection with unique identifier @con_id between @fwnode and another * device node. @match will be used to convert the connection description to * data the caller is expecting to be returned. */ void *fwnode_connection_find_match(struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match) { void *ret; if (!fwnode || !match) return NULL; ret = fwnode_graph_devcon_match(fwnode, con_id, data, match); if (ret) return ret; return fwnode_devcon_match(fwnode, con_id, data, match); } EXPORT_SYMBOL_GPL(fwnode_connection_find_match); |
50 50 255 257 257 255 566 1 1 1 1 1 7 7 6 63 228 228 226 228 13 13 13 7 7 7 5 7 5 5 5 5 5 5 5 7 7 7 7 7 6 7 1 13 13 4 9 9 9 4 7 7 9 13 13 13 142 141 86 1 19 65 62 62 62 62 86 86 85 86 13 13 460 51 51 369 372 256 258 256 6 6 202 202 109 169 433 113 113 113 113 110 110 458 252 250 249 80 203 80 80 243 459 458 460 255 287 45 45 45 1 1 6 8 8 8 7 1 6 6 6 6 6 6 6 1 3 3 3 8 8 466 37 418 61 2 177 142 34 208 322 79 208 212 212 460 460 315 99 59 14 143 456 602 598 601 613 476 3 64 197 11 119 38 214 436 14 117 15 4 11 58 3 57 13 39 251 13 38 30 100 191 204 148 469 554 259 6 401 7 7 2 5 7 7 7 7 7 2 5 7 7 7 7 7 7 5 2 2 7 3 3 3 3 3 3 3 20 20 20 20 20 20 3 20 1 24 25 25 24 22 22 22 22 1 21 16 6 19 2 19 2 21 21 19 2 19 2 21 2 5 5 10 10 10 8 5 8 5 6 7 7 17 279 50 50 50 50 50 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * ROUTE - implementation of the IP router. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Linus Torvalds, <Linus.Torvalds@helsinki.fi> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Alan Cox : Verify area fixes. * Alan Cox : cli() protects routing changes * Rui Oliveira : ICMP routing table updates * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. * Alan Cox : Removed compatibility cruft. * Alan Cox : RTF_REJECT support. * Alan Cox : TCP irtt support. * Jonathan Naylor : Added Metric support. * Miquel van Smoorenburg : BSD API fixes. * Miquel van Smoorenburg : Metrics. * Alan Cox : Use __u32 properly * Alan Cox : Aligned routing errors more closely with BSD * our system is still very different. * Alan Cox : Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. * * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Vitaly E. Lavrov : Race condition in ip_route_input_slow. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. * Vladimir V. Ivanov : IP rule info (flowid) is really useful. * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/times.h> #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> #include "fib_lookup.h" #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ) static int ip_rt_max_size; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; static int ip_rt_min_advmss __read_mostly = 256; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. */ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); return NULL; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, .confirm_neigh = ipv4_confirm_neigh, }; #define ECN_OR_COST(class) TC_PRIO_##class const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos) return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { } static int rt_cache_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); return 0; } static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, .show = rt_cache_seq_show, }; static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } (*pos)++; return NULL; } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) { } static int rt_cpu_seq_show(struct seq_file *seq, void *v) { struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, st->in_brd, st->in_martian_dst, st->in_martian_src, 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, 0, /* st->gc_total */ 0, /* st->gc_ignored */ 0, /* st->gc_goal_miss */ 0, /* st->gc_dst_overflow */ 0, /* st->in_hlist_search */ 0 /* st->out_hlist_search */ ); return 0; } static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, .show = rt_cpu_seq_show, }; #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; unsigned int i, j; dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); if (!dst) return -ENOMEM; for_each_possible_cpu(i) { src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); for (j = 0; j < 256; j++) { dst[j].o_bytes += src[j].o_bytes; dst[j].o_packets += src[j].o_packets; dst[j].i_bytes += src[j].i_bytes; dst[j].i_packets += src[j].i_packets; } } seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); kfree(dst); return 0; } #endif static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; pde = proc_create_seq("rt_cache", 0444, net->proc_net, &rt_cache_seq_ops); if (!pde) goto err1; pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_ops); if (!pde) goto err2; #ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create_single("rt_acct", 0, net->proc_net, rt_acct_proc_show); if (!pde) goto err3; #endif return 0; #ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif err2: remove_proc_entry("rt_cache", net->proc_net); err1: return -ENOMEM; } static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); #ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, }; static int __init ip_rt_proc_init(void) { return register_pernet_subsys(&ip_rt_proc_ops); } #else static inline int ip_rt_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ static inline bool rt_is_expired(const struct rtable *rth) { bool res; rcu_read_lock(); res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); rcu_read_unlock(); return res; } void rt_cache_flush(struct net *net) { rt_genid_bump_ipv4(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; struct neighbour *n; rcu_read_lock_bh(); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { n = ip_neigh_gw6(dev, &rt->rt_gw6); } else { __be32 pkey; pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr); n = ip_neigh_gw4(dev, pkey); } if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock_bh(); return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; if (rt->rt_gw_family == AF_INET) { pkey = (const __be32 *)&rt->rt_gw4; } else if (rt->rt_gw_family == AF_INET6) { return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); } else if (!daddr || (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } /* Hash tables of size 2048..262144 depending on RAM size. * Each bucket uses 8 bytes. */ static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes hard for an attacker * to infer how many packets were sent between two points in time. */ u32 ip_idents_reserve(u32 hash, int segs) { u32 bucket, old, now = (u32)jiffies; atomic_t *p_id; u32 *p_tstamp; u32 delta = 0; bucket = hash & ip_idents_mask; p_tstamp = ip_tstamps + bucket; p_id = ip_idents + bucket; old = READ_ONCE(*p_tstamp); if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); /* If UBSAN reports an error there, please make sure your compiler * supports -fno-strict-overflow before reporting it that was a bug * in UBSAN, and it has been fixed in GCC-8. */ return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { u32 hash, id; /* Note the following code is not safe, but this is okay. */ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) get_random_bytes(&net->ipv4.ip_id_key, sizeof(net->ipv4.ip_id_key)); hash = siphash_3u32((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, &net->ipv4.ip_id_key); id = ip_idents_reserve(hash, segs); iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); static void ip_rt_fix_tos(struct flowi4 *fl4) { __u8 tos = RT_FL_TOS(fl4); fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = tos & RTO_ONLINK ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; } static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, u8 prot, u32 mark, int flow_flags) { if (sk) { const struct inet_sock *inet = inet_sk(sk); oif = sk->sk_bound_dev_if; mark = sk->sk_mark; tos = RT_CONN_FLAGS(sk); prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; } flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; u32 mark = skb->mark; __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); } static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct sk_buff *skb) { if (skb) build_skb_flow_key(fl4, skb, sk); else build_sk_flow_key(fl4, sk); } static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { struct rtable *rt; rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } } static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception __rcu **fnhe_p, **oldest_p; struct fib_nh_exception *fnhe, *oldest = NULL; for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); if (!fnhe) break; if (!oldest || time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; oldest_p = fnhe_p; } } fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); } static u32 fnhe_hashfun(__be32 daddr) { static siphash_key_t fnhe_hash_key __read_mostly; u64 hval; net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { rt->rt_pmtu = fnhe->fnhe_pmtu; rt->rt_mtu_locked = fnhe->fnhe_mtu_locked; rt->dst.expires = fnhe->fnhe_expires; if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } } static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, __be32 gw, u32 pmtu, bool lock, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; u32 genid, hval; unsigned int i; int depth; genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; depth = 0; for (fnhe = rcu_dereference(hash->chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) break; depth++; } if (fnhe) { if (fnhe->fnhe_genid != genid) fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; if (pmtu) { fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; } fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) fill_route_from_fnhe(rt, fnhe); rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) fill_route_from_fnhe(rt, fnhe); } else { /* Randomize max depth to avoid some side channels attacks. */ int max_depth = FNHE_RECLAIM_DEPTH + prandom_u32_max(FNHE_RECLAIM_DEPTH); while (depth > max_depth) { fnhe_remove_oldest(hash); depth--; } fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); if (!fnhe) goto out_unlock; fnhe->fnhe_next = hash->chain; fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); rcu_assign_pointer(hash->chain, fnhe); /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. */ rt = rcu_dereference(nhc->nhc_rth_input); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; for_each_possible_cpu(i) { struct rtable __rcu **prt; prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; } } fnhe->fnhe_stamp = jiffies; out_unlock: spin_unlock_bh(&fnhe_lock); } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct neighbour *n; struct net *net; switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: case ICMP_REDIR_NETTOS: case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: break; default: return; } if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; net = dev_net(dev); if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, skb); nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) { const struct iphdr *iph = (const struct iphdr *) skb->data; __be32 daddr = iph->daddr; __be32 saddr = iph->saddr; net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); } #endif ; } static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; u32 mark = skb->mark; rt = (struct rtable *) dst; __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); ip_rt_fix_tos(&fl4); __ip_do_redirect(rt, skb, &fl4, true); } static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; if ((dst->obsolete > 0) || (rt->rt_flags & RTCF_REDIRECTED) || rt->dst.expires) sk_dst_reset(sk); } /* * Algorithm: * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting * in icmp.c. * * NOTE. Do not forget to inhibit load limiting for redirects (redundant) * and "frag. need" (breaks PMTU discovery) in icmp.c. */ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; struct net *net; int log_martians; int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); rcu_read_unlock(); net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) { peer->rate_tokens = 0; peer->n_redirects = 0; } /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; goto out_put_peer; } /* Check for load limit; set rate_last to the latest sent * redirect. */ if (peer->n_redirects == 0 || time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } out_put_peer: inet_putpeer(peer); } static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = skb->dev; struct in_device *in_dev; struct inet_peer *peer; unsigned long now; struct net *net; bool send; int code; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) goto out; } in_dev = __in_dev_get_rcu(dev); /* IP on this device is disabled. */ if (!in_dev) goto out; net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { case EHOSTUNREACH: __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS); break; case ENETUNREACH: __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; } goto out; } switch (rt->dst.error) { case EINVAL: default: goto out; case EHOSTUNREACH: code = ICMP_HOST_UNREACH; break; case ENETUNREACH: code = ICMP_NET_UNREACH; __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, l3mdev_master_ifindex(skb->dev), 1); send = true; if (peer) { now = jiffies; peer->rate_tokens += now - peer->rate_last; if (peer->rate_tokens > ip_rt_error_burst) peer->rate_tokens = ip_rt_error_burst; peer->rate_last = now; if (peer->rate_tokens >= ip_rt_error_cost) peer->rate_tokens -= ip_rt_error_cost; else send = false; inet_putpeer(peer); } if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb(skb); return 0; } static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct fib_result res; bool lock = false; struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) return; old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; rcu_read_lock(); net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); } if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) goto out; if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res.fi) > 1) { int nhsel; for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { nhc = fib_info_nhc(res.fi, nhsel); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } out: rcu_read_unlock(); } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct rtable *rt = (struct rtable *) dst; struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); ip_rt_fix_tos(&fl4); /* Don't make lookup fail for bridged encapsulations */ if (skb && netif_is_any_bridge_port(skb->dev)) fl4.flowi4_oif = 0; __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; struct net *net = sock_net(sk); bh_lock_sock(sk); if (!ip_sk_accept_pmtu(sk)) goto out; odst = sk_dst_get(sk); if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = (struct rtable *)odst; if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } else { ip_rt_fix_tos(&fl4); } __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } if (new) sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or * DST_OBSOLETE_DEAD. */ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; return dst; } EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { struct net_device *dev; struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. * Also check we have a reasonable ipv4 header. */ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; memset(&opt, 0, sizeof(opt)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); rcu_read_unlock(); if (res) return; } __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); } static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); WARN_ON(1); return 0; } /* * We do not cache source address of outgoing interface, * because it is used only by IP RR, TS and SRR options, * so that it out of fast path. * * BTW remember: "addr" is allowed to be not aligned * in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; if (rt_is_output_route(rt)) src = ip_hdr(skb)->saddr; else { struct fib_result res; struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = iph->tos & IPTOS_RT_MASK, .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), RT_SCOPE_UNIVERSE); rcu_read_unlock(); } memcpy(addr, &src, 4); } #ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) rt->dst.tclassid |= tag & 0xFFFF; if (!(rt->dst.tclassid & 0xFFFF0000)) rt->dst.tclassid |= tag & 0xFFFF0000; } #endif static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, ip_rt_min_advmss); return min(advmss, IPV4_MAX_PMTU - header_size); } INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; u32 hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; fnhe_p = &hash->chain; fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); while (fnhe) { if (fnhe->fnhe_daddr == daddr) { rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); /* set fnhe_daddr to 0 to ensure it won't bind with * new dsts in rt_bind_exception(). */ fnhe->fnhe_daddr = 0; fnhe_flush_routes(fnhe); kfree_rcu(fnhe, rcu); break; } fnhe_p = &fnhe->fnhe_next; fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)); } spin_unlock_bh(&fnhe_lock); } static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; if (!hash) return NULL; hval = fnhe_hashfun(daddr); for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { ip_del_fnhe(nhc, daddr); break; } return fnhe; } } return NULL; } /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. mtu from egress device */ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { struct fib_nh_common *nhc = res->nhc; struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; u32 mtu = 0; if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; if (likely(!mtu)) { struct fib_nh_exception *fnhe; fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { bool ret = false; spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { struct rtable __rcu **porig; struct rtable *orig; int genid = fnhe_genid(dev_net(rt->dst.dev)); if (rt_is_input_route(rt)) porig = &fnhe->fnhe_rth_input; else porig = &fnhe->fnhe_rth_output; orig = rcu_dereference(*porig); if (fnhe->fnhe_genid != genid) { fnhe->fnhe_genid = genid; fnhe->fnhe_gw = 0; fnhe->fnhe_pmtu = 0; fnhe->fnhe_expires = 0; fnhe->fnhe_mtu_locked = false; fnhe_flush_routes(fnhe); orig = NULL; } fill_route_from_fnhe(rt, fnhe); if (!rt->rt_gw4) { rt->rt_gw4 = daddr; rt->rt_gw_family = AF_INET; } if (do_cache) { dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); if (orig) { dst_dev_put(&orig->dst); dst_release(&orig->dst); } ret = true; } fnhe->fnhe_stamp = jiffies; } spin_unlock_bh(&fnhe_lock); return ret; } static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { p = (struct rtable **)&nhc->nhc_rth_input; } else { p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; /* hold dst before doing cmpxchg() to avoid race condition * on this dst */ dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { dst_release(&rt->dst); ret = false; } return ret; } struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); void rt_add_uncached_list(struct rtable *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); rt->rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt_del_uncached_list(struct rtable *rt) { if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; spin_lock_bh(&ul->lock); list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } } static void ipv4_dst_destroy(struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; ip_dst_metrics_put(dst); rt_del_uncached_list(rt); } void rt_flush_dev(struct net_device *dev) { struct rtable *rt; int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); spin_lock_bh(&ul->lock); list_for_each_entry(rt, &ul->head, rt_uncached) { if (rt->dst.dev != dev) continue; rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(dev); } spin_unlock_bh(&ul->lock); } } static bool rt_cache_valid(const struct rtable *rt) { return rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt); } static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag, const bool do_cache) { bool cached = false; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) rt->rt_gw4 = nhc->nhc_gw.ipv4; else rt->rt_gw6 = nhc->nhc_gw.ipv6; } ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); rt->dst.tclassid = nh->nh_tclassid; } #endif rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ if (!rt->rt_gw4) { rt->rt_gw_family = AF_INET; rt->rt_gw4 = daddr; } rt_add_uncached_list(rt); } } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif } struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool nopolicy, bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); rt->rt_flags = flags; rt->rt_type = type; rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; if (flags & RTCF_LOCAL) rt->dst.input = ip_local_deliver; } return rt; } EXPORT_SYMBOL(rt_dst_alloc); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) { struct rtable *new_rt; new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, rt->dst.flags); if (new_rt) { new_rt->rt_genid = rt_genid_ipv4(dev_net(dev)); new_rt->rt_flags = rt->rt_flags; new_rt->rt_type = rt->rt_type; new_rt->rt_is_input = rt->rt_is_input; new_rt->rt_iif = rt->rt_iif; new_rt->rt_pmtu = rt->rt_pmtu; new_rt->rt_mtu_locked = rt->rt_mtu_locked; new_rt->rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) new_rt->rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; INIT_LIST_HEAD(&new_rt->rt_uncached); new_rt->dst.input = rt->dst.input; new_rt->dst.output = rt->dst.output; new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); } return new_rt; } EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct in_device *in_dev, u32 *itag) { int err; /* Primary sanity checks. */ if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, in_dev, itag); if (err < 0) return err; } return 0; } /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; struct rtable *rth; bool no_policy; u32 itag = 0; int err; err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); if (err) return err; if (our) flags |= RTCF_LOCAL; no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); if (no_policy) IPCB(skb)->flags |= IPSKB_NOPOLICY; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, no_policy, false); if (!rth) return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; rth->rt_is_input= 1; #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return 0; } static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ pr_warn("martian source %pI4 from %pI4, on dev %s\n", &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), dev->hard_header_len, false); } } #endif } /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; bool do_cache, no_policy; u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return -EINVAL; } err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); goto cleanup; } do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { __be32 gw; gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. * * Proxy arp feature have been extended to allow, ARP * replies back to the same interface, to support * Private VLAN switch technologies. See arp.c. */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } } no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); if (no_policy) IPCB(skb)->flags |= IPSKB_NOPOLICY; fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; } } rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; } rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: err = 0; cleanup: return err; } #ifdef CONFIG_IP_ROUTE_MULTIPATH /* To make ICMP packets follow the right flow, the multipath hash is * calculated from the inner IP addresses. */ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; if (likely(outer_iph->protocol != IPPROTO_ICMP)) goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; out: hash_keys->addrs.v4addrs.src = key_iph->saddr; hash_keys->addrs.v4addrs.dst = key_iph->daddr; } static u32 fib_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return flow_hash_from_keys(&hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. */ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return flow_hash_from_keys(&hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 fib_multipath_custom_hash_fl4(const struct net *net, const struct flowi4 *fl4) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = fl4->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = fl4->fl4_sport; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; return flow_hash_from_keys(&hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; u32 mhash = 0; switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (skb) { ip_multipath_l3_keys(skb, &hash_keys); } else { hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = flow_hash_from_keys(&hash_keys); break; case 1: /* skb is currently provided only when forwarding */ if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } mhash = flow_hash_from_keys(&hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); /* skb is currently provided only when forwarding */ if (skb) { struct flow_keys keys; skb_flow_dissect_flow_keys(skb, &keys, 0); /* Inner can be v4 or v6 */ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; hash_keys.tags.flow_label = keys.tags.flow_label; hash_keys.basic.ip_proto = keys.basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; ip_multipath_l3_keys(skb, &hash_keys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = flow_hash_from_keys(&hash_keys); break; case 3: if (skb) mhash = fib_multipath_custom_hash_skb(net, skb); else mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); return mhash >> 1; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif /* create a routing cache entry */ return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } /* Implements all the saddr-related checks as ip_route_input_slow(), * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, const struct sk_buff *hint) { struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); int err = -EINVAL; u32 tag = 0; if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; if (ipv4_is_zeronet(saddr)) goto martian_source; if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) goto martian_source; if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; tos &= IPTOS_RT_MASK; err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); if (err < 0) goto martian_source; skip_validate_source: skb_dst_copy(skb, hint); return 0; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); return err; } /* get device for dst_alloc with local routes */ static struct net_device *ip_rt_get_dev(struct net *net, const struct fib_result *res) { struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; struct net_device *dev = NULL; if (nhc) dev = l3mdev_master_dev_rcu(nhc->nhc_dev); return dev ? : net->loopback_dev; } /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * Changes in the enforced policies must be applied also to * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. * called with rcu_read_lock() */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct fib_result *res) { struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; struct flowi4 fl4; bool do_cache = true; bool no_policy; /* IP on this device is disabled. */ if (!in_dev) goto out; /* Check for the most weird martians, which can be not detected * by fib_lookup. */ tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; res->fi = NULL; res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ if (ipv4_is_zeronet(saddr)) goto martian_source; if (ipv4_is_zeronet(daddr)) goto martian_destination; /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) goto martian_destination; } else if (ipv4_is_loopback(saddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) goto martian_source; } /* * Now we are ready to route packet. */ fl4.flowi4_l3mdev = 0; fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; } else { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; goto no_route; } if (res->type == RTN_BROADCAST) { if (IN_DEV_BFORWARD(in_dev)) goto make_route; /* not do cache if bc_forwarding is enabled */ if (IPV4_DEVCONF_ALL(net, BC_FORWARDING)) do_cache = false; goto brd_input; } if (res->type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag); if (err < 0) goto martian_source; goto local_input; } if (!IN_DEV_FORWARD(in_dev)) { err = -EHOSTUNREACH; goto no_route; } if (res->type != RTN_UNICAST) goto martian_destination; make_route: err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); out: return err; brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; if (!ipv4_is_zeronet(saddr)) { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, in_dev, &itag); if (err < 0) goto martian_source; } flags |= RTCF_BROADCAST; res->type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); if (no_policy) IPCB(skb)->flags |= IPSKB_NOPOLICY; do_cache &= res->fi && !itag; if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; goto out; } } rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, no_policy, false); if (!rth) goto e_nobufs; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } if (unlikely(!rt_cache_route(nhc, rth))) rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); err = 0; goto out; no_route: RT_CACHE_STAT_INC(in_no_route); res->type = RTN_UNREACHABLE; res->fi = NULL; res->table = NULL; goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif e_inval: err = -EINVAL; goto out; e_nobufs: err = -ENOBUFS; goto out; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; int err; tos &= IPTOS_RT_MASK; rcu_read_lock(); err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); rcu_read_unlock(); return err; } EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock held */ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing * hardware multicast filters :-( As result the host on multicasting * network acquires a lot of useless route cache entries, sort of * SDR messages from all the world. Now we try to get rid of them. * Really, provided software IP multicast filter is organized * reasonably (at least, hashed), it does not result in a slowdown * comparing with route cache reject entries. * Note, that multicast routers are not affected, because * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; int err = -EINVAL; if (!in_dev) return err; our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); /* check l3 master if no match yet */ if (!our && netif_is_l3_slave(dev)) { struct in_device *l3_in_dev; l3_in_dev = __in_dev_get_rcu(skb->dev); if (l3_in_dev) our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, ip_hdr(skb)->protocol); } if (our #ifdef CONFIG_IP_MROUTE || (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { err = ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } return err; } return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); } /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, int orig_oif, struct net_device *dev_out, unsigned int flags) { struct fib_info *fi = res->fi; struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) type = RTN_BROADCAST; else if (ipv4_is_multicast(fl4->daddr)) type = RTN_MULTICAST; else if (ipv4_is_zeronet(fl4->daddr)) return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; fi = NULL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; else do_cache = false; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. */ if (fi && res->prefixlen < 4) fi = NULL; } else if ((type == RTN_LOCAL) && (orig_oif != 0) && (orig_oif != dev_out->ifindex)) { /* For local routes that require a particular output interface * we do not want to cache the result. Caching the result * causes incorrect behaviour when there are multiple source * addresses on the interface, the end result being that if the * intended recipient is waiting on that interface for the * packet he won't receive it because it will be delivered on * the loopback interface and the IP_PKTINFO ipi_ifindex will * be set to the loopback interface as well. */ do_cache = false; } fnhe = NULL; do_cache &= fi != NULL; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct rtable __rcu **prth; fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { prth = &fnhe->fnhe_rth_output; } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; } add: rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_ORCONF(in_dev, NOPOLICY), IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; RT_CACHE_STAT_INC(out_slow_tot); if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mc_output; } } #endif } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); lwtunnel_set_redirect(&rth->dst); return rth; } /* * Major route resolver routine. */ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, const struct sk_buff *skb) { struct fib_result res = { .type = RTN_UNSPEC, .fi = NULL, .table = NULL, .tclassid = 0, }; struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; ip_rt_fix_tos(fl4); rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); rcu_read_unlock(); return rth; } EXPORT_SYMBOL_GPL(ip_route_output_key_hash); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb) { struct net_device *dev_out = NULL; int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; int err; if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr) || ipv4_is_zeronet(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. * It was wrong for two reasons: * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr * is assigned to multiple interfaces. * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK */ if (fl4->flowi4_oif == 0 && (ipv4_is_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, fl4->saddr, false); if (!dev_out) goto out; /* Special hack: user can direct multicasts * and limited broadcast via necessary interface * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. * This hack is not just for fun, it allows * vic,vat and friends to work. * They bind socket to loopback, set ttl to zero * and expect that it will work. * From the viewpoint of routing cache they are broken, * because we are not allowed to build multicast path * with loopback source addr (look, routing cache * cannot know, that ttl is zero, so that packet * will not leave this host and route is valid). * Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; goto make_route; } if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } if (fl4->flowi4_oif) { dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); if (!dev_out) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); goto out; } if (ipv4_is_local_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr) || fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); else if (!fl4->daddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } if (!fl4->daddr) { fl4->daddr = fl4->saddr; if (!fl4->daddr) fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } err = fib_lookup(net, fl4, res, 0); if (err) { res->fi = NULL; res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) { /* Apparently, routing tables are wrong. Assume, * that the destination is on link. * * WHY? DW. * Because we are allowed to send to iface * even if it has NO routes and NO assigned * addresses. When oif is specified, routing * tables are looked up with only one purpose: * to catch if destination is gatewayed, rather than * direct. Moreover, if MSG_DONTROUTE is set, * we send packet, ignoring both routing tables * and ifaddr state. --ANK * * * We could make it even if oif is unknown, * likely IPv6, but we do not. */ if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } if (res->type == RTN_LOCAL) { if (!fl4->saddr) { if (res->fi->fib_prefsrc) fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : net->loopback_dev; /* make sure orig_oif points to fib result device even * though packet rx/tx happens over loopback or l3mdev */ orig_oif = FIB_RES_OIF(*res); fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } fib_select_path(net, res, fl4, skb); dev_out = FIB_RES_DEV(*res); make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: return rth; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .default_advmss = ipv4_default_advmss, .neigh_lookup = ipv4_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; new->dev = net->loopback_dev; dev_hold(new->dev); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; rt->rt_mtu_locked = ort->rt_mtu_locked; rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; else if (rt->rt_gw_family == AF_INET6) rt->rt_gw6 = ort->rt_gw6; INIT_LIST_HEAD(&rt->rt_uncached); } dst_release(dst_orig); return rt ? &rt->dst : ERR_PTR(-ENOMEM); } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt; if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); } return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); struct rtable *ip_route_output_tunnel(struct sk_buff *skb, struct net_device *dev, struct net *net, __be32 *saddr, const struct ip_tunnel_info *info, u8 protocol, bool use_cache) { #ifdef CONFIG_DST_CACHE struct dst_cache *dst_cache; #endif struct rtable *rt = NULL; struct flowi4 fl4; __u8 tos; #ifdef CONFIG_DST_CACHE dst_cache = (struct dst_cache *)&info->dst_cache; if (use_cache) { rt = dst_cache_get_ip4(dst_cache, saddr); if (rt) return rt; } #endif memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = protocol; fl4.daddr = info->key.u.ipv4.dst; fl4.saddr = info->key.u.ipv4.src; tos = info->key.tos; fl4.flowi4_tos = RT_TOS(tos); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); return ERR_PTR(-ENETUNREACH); } if (rt->dst.dev == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr); ip_rt_put(rt); return ERR_PTR(-ELOOP); } #ifdef CONFIG_DST_CACHE if (use_cache) dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); #endif *saddr = fl4.saddr; return rt; } EXPORT_SYMBOL_GPL(ip_route_output_tunnel); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; u32 error; u32 metrics[RTAX_MAX]; nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; if (IPCB(skb)->flags & IPSKB_DOREDIRECT) r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; if (nla_put_in_addr(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; if (rt->dst.lwtstate && lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } if (rt->rt_uses_gateway) { if (rt->rt_gw_family == AF_INET && nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; } else if (rt->rt_gw_family == AF_INET6) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &rt->rt_gw6, alen); } } expires = rt->dst.expires; if (expires) { unsigned long now = jiffies; if (time_before(now, expires)) expires -= now; else expires = 0; } memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rt->rt_mtu_locked && expires) metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (fl4) { if (fl4->flowi4_mark && nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && nla_put_u32(skb, RTA_UID, from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) goto nla_put_failure; if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, r, portid); if (err <= 0) { if (err == 0) return 0; goto nla_put_failure; } } else #endif if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) goto nla_put_failure; } } error = rt->dst.error; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fnhe_hash_bucket *bucket, int genid, int *fa_index, int fa_start, unsigned int flags) { int i; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference(bucket[i].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { struct rtable *rt; int err; if (*fa_index < fa_start) goto next; if (fnhe->fnhe_genid != genid) goto next; if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) goto next; rt = rcu_dereference(fnhe->fnhe_rth_input); if (!rt) rt = rcu_dereference(fnhe->fnhe_rth_output); if (!rt) goto next; err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, table_id, NULL, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) return err; next: (*fa_index)++; } } return 0; } int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags) { struct net *net = sock_net(cb->skb->sk); int nhsel, genid = fnhe_genid(net); for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); struct fnhe_hash_bucket *bucket; int err; if (nhc->nhc_flags & RTNH_F_DEAD) continue; rcu_read_lock(); bucket = rcu_dereference(nhc->nhc_exceptions); err = 0; if (bucket) err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, genid, fa_index, fa_start, flags); rcu_read_unlock(); if (err) return err; } return 0; } static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) { struct sk_buff *skb; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return NULL; /* Reserve room for dummy headers, this skb can pass * through good chunk of routing engine. */ skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = ip_proto; iph->saddr = src; iph->daddr = dst; iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; skb_set_transport_header(skb, skb->len); switch (iph->protocol) { case IPPROTO_UDP: { struct udphdr *udph; udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } case IPPROTO_TCP: { struct tcphdr *tcph; tcph = skb_put_zero(skb, sizeof(struct tcphdr)); tcph->source = sport; tcph->dest = dport; tcph->doff = sizeof(struct tcphdr) / 4; tcph->rst = 1; tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), src, dst, 0); break; } case IPPROTO_ICMP: { struct icmphdr *icmph; icmph = skb_put_zero(skb, sizeof(struct icmphdr)); icmph->type = ICMP_ECHO; icmph->code = 0; } } return skb; } static int inet_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_NOTIFY | RTM_F_LOOKUP_TABLE | RTM_F_FIB_MATCH)) { NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_IIF: case RTA_OIF: case RTA_SRC: case RTA_DST: case RTA_IP_PROTO: case RTA_SPORT: case RTA_DPORT: case RTA_MARK: case RTA_UID: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); return -EINVAL; } } return 0; } static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; u32 table_id = RT_TABLE_MAIN; __be16 sport = 0, dport = 0; struct fib_result res = {}; u8 ip_proto = IPPROTO_UDP; struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; kuid_t uid; u32 iif; int err; int mark; err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; if (tb[RTA_UID]) uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else uid = (iif ? INVALID_UID : current_uid()); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &ip_proto, AF_INET, extack); if (err) return err; } if (tb[RTA_SPORT]) sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) dport = nla_get_be16(tb[RTA_DPORT]); skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); if (!skb) return -ENOBUFS; fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; if (sport) fl4.fl4_sport = sport; if (dport) fl4.fl4_dport = dport; fl4.flowi4_proto = ip_proto; rcu_read_lock(); if (iif) { struct net_device *dev; dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_rcu; } fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos & IPTOS_RT_MASK, dev, &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { fl4.flowi4_iif = LOOPBACK_IFINDEX; skb->dev = net->loopback_dev; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); else skb_dst_set(skb, &rt->dst); } if (err) goto errout_rcu; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = res.table ? res.table->tb_id : 0; /* reset skb for netlink reply msg */ skb_trim(skb, 0); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { struct fib_rt_info fri; if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } fri.fi = res.fi; fri.tb_id = table_id; fri.dst = res.prefix; fri.dst_len = res.prefixlen; fri.tos = fl4.flowi4_tos; fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { u8 slen = 32 - fri.dst_len; if (fa->fa_slen == slen && fa->tb_id == fri.tb_id && fa->fa_tos == fri.tos && fa->fa_info == res.fi && fa->fa_type == fri.type) { fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); break; } } } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; rcu_read_unlock(); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: return err; errout_rcu: rcu_read_unlock(); kfree_skb(skb); goto errout_free; } void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; if (write) { rt_cache_flush(net); fnhe_genid_bump(net); return 0; } return -EINVAL; } static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { /* Deprecated. Use gc_min_interval_ms */ .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "min_adv_mss", .data = &ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; static const char ipv4_route_flush_procname[] = "flush"; static struct ctl_table ipv4_route_netns_table[] = { { .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, }, { .procname = "min_pmtu", .data = &init_net.ipv4.ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &ip_min_valid_pmtu, }, { .procname = "mtu_expires", .data = &init_net.ipv4.ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { }, }; static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; tbl = ipv4_route_netns_table; if (!net_eq(net, &init_net)) { int i; tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL); if (!tbl) goto err_dup; /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { if (tbl[0].procname != ipv4_route_flush_procname) tbl[0].procname = NULL; } /* Update the variables to point into the current struct net * except for the first element flush */ for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); if (!net->ipv4.route_hdr) goto err_reg; return 0; err_reg: if (tbl != ipv4_route_netns_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_route_net_exit(struct net *net) { struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); BUG_ON(tbl == ipv4_route_netns_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, }; #endif static __net_init int netns_ip_rt_init(struct net *net) { /* Set default value for namespaceified sysctls */ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU; net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES; return 0; } static struct pernet_operations __net_initdata ip_rt_ops = { .init = netns_ip_rt_init, }; static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); return 0; } static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; static int __net_init ipv4_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv4.peers = bp; return 0; } static void __net_exit ipv4_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv4.peers; net->ipv4.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, }; #ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ int __init ip_rt_init(void) { void *idents_hash; int cpu; /* For modern hosts, this will use 2 MB of memory */ idents_hash = alloc_large_system_hash("IP idents", sizeof(*ip_idents) + sizeof(*ip_tstamps), 0, 16, /* one bucket per 64 KB */ HASH_ZERO, NULL, &ip_idents_mask, 2048, 256*1024); ip_idents = idents_hash; prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; if (dst_entries_init(&ipv4_dst_ops) < 0) panic("IP: failed to allocate ipv4_dst_ops counter\n"); if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); ipv4_dst_ops.gc_thresh = ~0; ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, RTNL_FLAG_DOIT_UNLOCKED); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&ip_rt_ops); register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); return 0; } #ifdef CONFIG_SYSCTL /* * We really need to sanitize the damn ipv4 init order, then all * this nonsense will go away. */ void __init ip_static_sysctl_init(void) { register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); } #endif |
80 116 58 63 63 36 36 4 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 | /* * net/tipc/name_distr.c: TIPC name distribution code * * Copyright (c) 2000-2006, 2014-2019, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "name_distr.h" int sysctl_tipc_named_timeout __read_mostly = 2000; struct distr_queue_item { struct distr_item i; u32 dtype; u32 node; unsigned long expires; struct list_head next; }; /** * publ_to_item - add publication info to a publication message * @p: publication info * @i: location of item in the message */ static void publ_to_item(struct distr_item *i, struct publication *p) { i->type = htonl(p->sr.type); i->lower = htonl(p->sr.lower); i->upper = htonl(p->sr.upper); i->port = htonl(p->sk.ref); i->key = htonl(p->key); } /** * named_prepare_buf - allocate & initialize a publication message * @net: the associated network namespace * @type: message type * @size: payload size * @dest: destination node * * The buffer returned is of size INT_H_SIZE + payload size */ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) { struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); u32 self = tipc_own_addr(net); struct tipc_msg *msg; if (buf != NULL) { msg = buf_msg(buf); tipc_msg_init(self, msg, NAME_DISTRIBUTOR, type, INT_H_SIZE, dest); msg_set_size(msg, INT_H_SIZE + size); } return buf; } /** * tipc_named_publish - tell other nodes about a new publication by this node * @net: the associated network namespace * @p: the new publication */ struct sk_buff *tipc_named_publish(struct net *net, struct publication *p) { struct name_table *nt = tipc_name_table(net); struct distr_item *item; struct sk_buff *skb; if (p->scope == TIPC_NODE_SCOPE) { list_add_tail_rcu(&p->binding_node, &nt->node_scope); return NULL; } write_lock_bh(&nt->cluster_scope_lock); list_add_tail(&p->binding_node, &nt->cluster_scope); write_unlock_bh(&nt->cluster_scope_lock); skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); if (!skb) { pr_warn("Publication distribution failure\n"); return NULL; } msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, p); return skb; } /** * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node * @net: the associated network namespace * @p: the withdrawn publication */ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *p) { struct name_table *nt = tipc_name_table(net); struct distr_item *item; struct sk_buff *skb; write_lock_bh(&nt->cluster_scope_lock); list_del(&p->binding_node); write_unlock_bh(&nt->cluster_scope_lock); if (p->scope == TIPC_NODE_SCOPE) return NULL; skb = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); if (!skb) { pr_warn("Withdrawal distribution failure\n"); return NULL; } msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, p); return skb; } /** * named_distribute - prepare name info for bulk distribution to another node * @net: the associated network namespace * @list: list of messages (buffers) to be returned from this function * @dnode: node to be updated * @pls: linked list of publication items to be packed into buffer chain * @seqno: sequence number for this message */ static void named_distribute(struct net *net, struct sk_buff_head *list, u32 dnode, struct list_head *pls, u16 seqno) { struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; struct tipc_msg *hdr; list_for_each_entry(publ, pls, binding_node) { /* Prepare next buffer: */ if (!skb) { skb = named_prepare_buf(net, PUBLICATION, msg_rem, dnode); if (!skb) { pr_warn("Bulk publication failure\n"); return; } hdr = buf_msg(skb); msg_set_bc_ack_invalid(hdr, true); msg_set_bulk(hdr); msg_set_non_legacy(hdr); item = (struct distr_item *)msg_data(hdr); } /* Pack publication into message: */ publ_to_item(item, publ); item++; msg_rem -= ITEM_SIZE; /* Append full buffer to list: */ if (!msg_rem) { __skb_queue_tail(list, skb); skb = NULL; msg_rem = msg_dsz; } } if (skb) { hdr = buf_msg(skb); msg_set_size(hdr, INT_H_SIZE + (msg_dsz - msg_rem)); skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); __skb_queue_tail(list, skb); } hdr = buf_msg(skb_peek_tail(list)); msg_set_last_bulk(hdr); msg_set_named_seqno(hdr, seqno); } /** * tipc_named_node_up - tell specified node about all publications by this node * @net: the associated network namespace * @dnode: destination node * @capabilities: peer node's capabilities */ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff_head head; u16 seqno; __skb_queue_head_init(&head); spin_lock_bh(&tn->nametbl_lock); if (!(capabilities & TIPC_NAMED_BCAST)) nt->rc_dests++; seqno = nt->snd_nxt; spin_unlock_bh(&tn->nametbl_lock); read_lock_bh(&nt->cluster_scope_lock); named_distribute(net, &head, dnode, &nt->cluster_scope, seqno); tipc_node_xmit(net, &head, dnode, 0); read_unlock_bh(&nt->cluster_scope_lock); } /** * tipc_publ_purge - remove publication associated with a failed node * @net: the associated network namespace * @p: the publication to remove * @addr: failed node's address * * Invoked for each publication issued by a newly failed node. * Removes publication structure from name table & deletes it. */ static void tipc_publ_purge(struct net *net, struct publication *p, u32 addr) { struct tipc_net *tn = tipc_net(net); struct publication *_p; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, p->scope, p->sr.type, p->sr.lower, p->sr.upper); spin_lock_bh(&tn->nametbl_lock); _p = tipc_nametbl_remove_publ(net, &ua, &p->sk, p->key); if (_p) tipc_node_unsubscribe(net, &_p->binding_node, addr); spin_unlock_bh(&tn->nametbl_lock); if (_p) kfree_rcu(_p, rcu); } void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr, u16 capabilities) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *publ, *tmp; list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); spin_lock_bh(&tn->nametbl_lock); if (!(capabilities & TIPC_NAMED_BCAST)) nt->rc_dests--; spin_unlock_bh(&tn->nametbl_lock); } /** * tipc_update_nametbl - try to process a nametable update and notify * subscribers * @net: the associated network namespace * @i: location of item in the message * @node: node address * @dtype: name distributor message type * * tipc_nametbl_lock must be held. * Return: the publication item if successful, otherwise NULL. */ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) { struct publication *p = NULL; struct tipc_socket_addr sk; struct tipc_uaddr ua; u32 key = ntohl(i->key); tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, ntohl(i->type), ntohl(i->lower), ntohl(i->upper)); sk.ref = ntohl(i->port); sk.node = node; if (dtype == PUBLICATION) { p = tipc_nametbl_insert_publ(net, &ua, &sk, key); if (p) { tipc_node_subscribe(net, &p->binding_node, node); return true; } } else if (dtype == WITHDRAWAL) { p = tipc_nametbl_remove_publ(net, &ua, &sk, key); if (p) { tipc_node_unsubscribe(net, &p->binding_node, node); kfree_rcu(p, rcu); return true; } pr_warn_ratelimited("Failed to remove binding %u,%u from %u\n", ua.sr.type, ua.sr.lower, node); } else { pr_warn_ratelimited("Unknown name table message received\n"); } return false; } static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, u16 *rcv_nxt, bool *open) { struct sk_buff *skb, *tmp; struct tipc_msg *hdr; u16 seqno; spin_lock_bh(&namedq->lock); skb_queue_walk_safe(namedq, skb, tmp) { if (unlikely(skb_linearize(skb))) { __skb_unlink(skb, namedq); kfree_skb(skb); continue; } hdr = buf_msg(skb); seqno = msg_named_seqno(hdr); if (msg_is_last_bulk(hdr)) { *rcv_nxt = seqno; *open = true; } if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) { __skb_unlink(skb, namedq); spin_unlock_bh(&namedq->lock); return skb; } if (*open && (*rcv_nxt == seqno)) { (*rcv_nxt)++; __skb_unlink(skb, namedq); spin_unlock_bh(&namedq->lock); return skb; } if (less(seqno, *rcv_nxt)) { __skb_unlink(skb, namedq); kfree_skb(skb); continue; } } spin_unlock_bh(&namedq->lock); return NULL; } /** * tipc_named_rcv - process name table update messages sent by another node * @net: the associated network namespace * @namedq: queue to receive from * @rcv_nxt: store last received seqno here * @open: last bulk msg was received (FIXME) */ void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, u16 *rcv_nxt, bool *open) { struct tipc_net *tn = tipc_net(net); struct distr_item *item; struct tipc_msg *hdr; struct sk_buff *skb; u32 count, node; spin_lock_bh(&tn->nametbl_lock); while ((skb = tipc_named_dequeue(namedq, rcv_nxt, open))) { hdr = buf_msg(skb); node = msg_orignode(hdr); item = (struct distr_item *)msg_data(hdr); count = msg_data_sz(hdr) / ITEM_SIZE; while (count--) { tipc_update_nametbl(net, item, node, msg_type(hdr)); item++; } kfree_skb(skb); } spin_unlock_bh(&tn->nametbl_lock); } /** * tipc_named_reinit - re-initialize local publications * @net: the associated network namespace * * This routine is called whenever TIPC networking is enabled. * All name table entries published by this node are updated to reflect * the node's new network address. */ void tipc_named_reinit(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p; u32 self = tipc_own_addr(net); spin_lock_bh(&tn->nametbl_lock); list_for_each_entry_rcu(p, &nt->node_scope, binding_node) p->sk.node = self; list_for_each_entry_rcu(p, &nt->cluster_scope, binding_node) p->sk.node = self; nt->rc_dests = 0; spin_unlock_bh(&tn->nametbl_lock); } |
3 4 4 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/errno.h> #include <linux/list.h> #include <linux/moduleparam.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/string.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "netlink.h" char batadv_routing_algo[20] = "BATMAN_IV"; static struct hlist_head batadv_algo_list; /** * batadv_algo_init() - Initialize batman-adv algorithm management data * structures */ void batadv_algo_init(void) { INIT_HLIST_HEAD(&batadv_algo_list); } /** * batadv_algo_get() - Search for algorithm with specific name * @name: algorithm name to find * * Return: Pointer to batadv_algo_ops on success, NULL otherwise */ struct batadv_algo_ops *batadv_algo_get(const char *name) { struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { if (strcmp(bat_algo_ops_tmp->name, name) != 0) continue; bat_algo_ops = bat_algo_ops_tmp; break; } return bat_algo_ops; } /** * batadv_algo_register() - Register callbacks for a mesh algorithm * @bat_algo_ops: mesh algorithm callbacks to add * * Return: 0 on success or negative error number in case of failure */ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) { struct batadv_algo_ops *bat_algo_ops_tmp; bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); if (bat_algo_ops_tmp) { pr_info("Trying to register already registered routing algorithm: %s\n", bat_algo_ops->name); return -EEXIST; } /* all algorithms must implement all ops (for now) */ if (!bat_algo_ops->iface.enable || !bat_algo_ops->iface.disable || !bat_algo_ops->iface.update_mac || !bat_algo_ops->iface.primary_set || !bat_algo_ops->neigh.cmp || !bat_algo_ops->neigh.is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); return -EINVAL; } INIT_HLIST_NODE(&bat_algo_ops->list); hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); return 0; } /** * batadv_algo_select() - Select algorithm of soft interface * @bat_priv: the bat priv with all the soft interface information * @name: name of the algorithm to select * * The algorithm callbacks for the soft interface will be set when the algorithm * with the correct name was found. Any previous selected algorithm will not be * deinitialized and the new selected algorithm will also not be initialized. * It is therefore not allowed to call batadv_algo_select outside the creation * function of the soft interface. * * Return: 0 on success or negative error number in case of failure */ int batadv_algo_select(struct batadv_priv *bat_priv, const char *name) { struct batadv_algo_ops *bat_algo_ops; bat_algo_ops = batadv_algo_get(name); if (!bat_algo_ops) return -EINVAL; bat_priv->algo_ops = bat_algo_ops; return 0; } static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) { struct batadv_algo_ops *bat_algo_ops; char *algo_name = (char *)val; size_t name_len = strlen(algo_name); if (name_len > 0 && algo_name[name_len - 1] == '\n') algo_name[name_len - 1] = '\0'; bat_algo_ops = batadv_algo_get(algo_name); if (!bat_algo_ops) { pr_err("Routing algorithm '%s' is not supported\n", algo_name); return -EINVAL; } return param_set_copystring(algo_name, kp); } static const struct kernel_param_ops batadv_param_ops_ra = { .set = batadv_param_set_ra, .get = param_get_string, }; static struct kparam_string batadv_param_string_ra = { .maxlen = sizeof(batadv_routing_algo), .string = batadv_routing_algo, }; module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, 0644); /** * batadv_algo_dump_entry() - fill in information about one supported routing * algorithm * @msg: netlink message to be sent back * @portid: Port to reply to * @seq: Sequence number of message * @bat_algo_ops: Algorithm to be dumped * * Return: Error number, or 0 on success */ static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_algo_ops *bat_algo_ops) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_algo_dump() - fill in information about supported routing * algorithms * @msg: netlink message to be sent back * @cb: Parameters to the netlink request * * Return: Length of reply message. */ int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_algo_ops *bat_algo_ops; int skip = cb->args[0]; int i = 0; hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { if (i++ < skip) continue; if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq, bat_algo_ops)) { i--; break; } } cb->args[0] = i; return msg->len; } |
89 31 89 9 20 5 88 68 156 88 68 161 162 156 115 107 14 156 156 106 114 14 14 14 22 5 17 14 21 5 2 14 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 | // SPDX-License-Identifier: GPL-2.0-or-later /* Local endpoint object management * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/udp.h> #include <linux/ip.h> #include <linux/hashtable.h> #include <net/sock.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/af_rxrpc.h> #include "ar-internal.h" static void rxrpc_local_processor(struct work_struct *); static void rxrpc_local_rcu(struct rcu_head *); /* * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, * same or greater than. * * We explicitly don't compare the RxRPC service ID as we want to reject * conflicting uses by differing services. Further, we don't want to share * addresses with different options (IPv6), so we don't compare those bits * either. */ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { long diff; diff = ((local->srx.transport_type - srx->transport_type) ?: (local->srx.transport_len - srx->transport_len) ?: (local->srx.transport.family - srx->transport.family)); if (diff != 0) return diff; switch (srx->transport.family) { case AF_INET: /* If the choice of UDP port is left up to the transport, then * the endpoint record doesn't match. */ return ((u16 __force)local->srx.transport.sin.sin_port - (u16 __force)srx->transport.sin.sin_port) ?: memcmp(&local->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: /* If the choice of UDP6 port is left up to the transport, then * the endpoint record doesn't match. */ return ((u16 __force)local->srx.transport.sin6.sin6_port - (u16 __force)srx->transport.sin6.sin6_port) ?: memcmp(&local->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); #endif default: BUG(); } } /* * Allocate a new local endpoint. */ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { refcount_set(&local->ref, 1); atomic_set(&local->active_users, 1); local->rxnet = rxnet; INIT_HLIST_NODE(&local->link); INIT_WORK(&local->processor, rxrpc_local_processor); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, NULL); } _leave(" = %p", local); return local; } /* * create the local socket * - must be called with rxrpc_local_mutex locked */ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct sockaddr_rxrpc *srx = &local->srx; struct udp_port_cfg udp_conf = {0}; struct sock *usk; int ret; _enter("%p{%d,%d}", local, srx->transport_type, srx->transport.family); udp_conf.family = srx->transport.family; udp_conf.use_udp_checksums = true; if (udp_conf.family == AF_INET) { udp_conf.local_ip = srx->transport.sin.sin_addr; udp_conf.local_udp_port = srx->transport.sin.sin_port; #if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) } else { udp_conf.local_ip6 = srx->transport.sin6.sin6_addr; udp_conf.local_udp_port = srx->transport.sin6.sin6_port; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; #endif } ret = udp_sock_create(net, &udp_conf, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; } tuncfg.encap_type = UDP_ENCAP_RXRPC; tuncfg.encap_rcv = rxrpc_input_packet; tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); /* set the socket up */ usk = local->socket->sk; usk->sk_error_report = rxrpc_error_report; switch (srx->transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ ip6_sock_set_recverr(usk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. */ fallthrough; case AF_INET: /* we want to receive ICMP errors */ ip_sock_set_recverr(usk); /* we want to set the don't fragment bit */ ip_sock_set_mtu_discover(usk, IP_PMTUDISC_DO); /* We want receive timestamps. */ sock_enable_timestamps(usk); break; default: BUG(); } _leave(" = 0"); return 0; } /* * Look up or create a new local endpoint using the specified local address. */ struct rxrpc_local *rxrpc_lookup_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; struct rxrpc_net *rxnet = rxrpc_net(net); struct hlist_node *cursor; const char *age; long diff; int ret; _enter("{%d,%d,%pISp}", srx->transport_type, srx->transport.family, &srx->transport); mutex_lock(&rxnet->local_mutex); hlist_for_each(cursor, &rxnet->local_endpoints) { local = hlist_entry(cursor, struct rxrpc_local, link); diff = rxrpc_local_cmp_key(local, srx); if (diff != 0) continue; /* Services aren't allowed to share transport sockets, so * reject that here. It is possible that the object is dying - * but it may also still have the local transport address that * we want bound. */ if (srx->srx_service) { local = NULL; goto addr_in_use; } /* Found a match. We want to replace a dying object. * Attempting to bind the transport socket may still fail if * we're attempting to use a local address that the dying * object is still using. */ if (!rxrpc_use_local(local)) break; age = "old"; goto found; } local = rxrpc_alloc_local(rxnet, srx); if (!local) goto nomem; ret = rxrpc_open_socket(local, net); if (ret < 0) goto sock_error; if (cursor) { hlist_replace_rcu(cursor, &local->link); cursor->pprev = NULL; } else { hlist_add_head_rcu(&local->link, &rxnet->local_endpoints); } age = "new"; found: mutex_unlock(&rxnet->local_mutex); _net("LOCAL %s %d {%pISp}", age, local->debug_id, &local->srx.transport); _leave(" = %p", local); return local; nomem: ret = -ENOMEM; sock_error: mutex_unlock(&rxnet->local_mutex); if (local) call_rcu(&local->rcu, rxrpc_local_rcu); _leave(" = %d", ret); return ERR_PTR(ret); addr_in_use: mutex_unlock(&rxnet->local_mutex); _leave(" = -EADDRINUSE"); return ERR_PTR(-EADDRINUSE); } /* * Get a ref on a local endpoint. */ struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local) { const void *here = __builtin_return_address(0); int r; __refcount_inc(&local->ref, &r); trace_rxrpc_local(local->debug_id, rxrpc_local_got, r + 1, here); return local; } /* * Get a ref on a local endpoint unless its usage has already reached 0. */ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) { const void *here = __builtin_return_address(0); int r; if (local) { if (__refcount_inc_not_zero(&local->ref, &r)) trace_rxrpc_local(local->debug_id, rxrpc_local_got, r + 1, here); else local = NULL; } return local; } /* * Queue a local endpoint and pass the caller's reference to the work item. */ void rxrpc_queue_local(struct rxrpc_local *local) { const void *here = __builtin_return_address(0); unsigned int debug_id = local->debug_id; int r = refcount_read(&local->ref); if (rxrpc_queue_work(&local->processor)) trace_rxrpc_local(debug_id, rxrpc_local_queued, r + 1, here); else rxrpc_put_local(local); } /* * Drop a ref on a local endpoint. */ void rxrpc_put_local(struct rxrpc_local *local) { const void *here = __builtin_return_address(0); unsigned int debug_id; bool dead; int r; if (local) { debug_id = local->debug_id; dead = __refcount_dec_and_test(&local->ref, &r); trace_rxrpc_local(debug_id, rxrpc_local_put, r, here); if (dead) call_rcu(&local->rcu, rxrpc_local_rcu); } } /* * Start using a local endpoint. */ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local) { local = rxrpc_get_local_maybe(local); if (!local) return NULL; if (!__rxrpc_use_local(local)) { rxrpc_put_local(local); return NULL; } return local; } /* * Cease using a local endpoint. Once the number of active users reaches 0, we * start the closure of the transport in the work processor. */ void rxrpc_unuse_local(struct rxrpc_local *local) { if (local) { if (__rxrpc_unuse_local(local)) { rxrpc_get_local(local); rxrpc_queue_local(local); } } } /* * Destroy a local endpoint's socket and then hand the record to RCU to dispose * of. * * Closing the socket cannot be done from bottom half context or RCU callback * context because it might sleep. */ static void rxrpc_local_destroyer(struct rxrpc_local *local) { struct socket *socket = local->socket; struct rxrpc_net *rxnet = local->rxnet; _enter("%d", local->debug_id); local->dead = true; mutex_lock(&rxnet->local_mutex); hlist_del_init_rcu(&local->link); mutex_unlock(&rxnet->local_mutex); rxrpc_clean_up_local_conns(local); rxrpc_service_connection_reaper(&rxnet->service_conn_reaper); ASSERT(!local->service); if (socket) { local->socket = NULL; kernel_sock_shutdown(socket, SHUT_RDWR); socket->sk->sk_user_data = NULL; sock_release(socket); } /* At this point, there should be no more packets coming in to the * local endpoint. */ rxrpc_purge_queue(&local->reject_queue); rxrpc_purge_queue(&local->event_queue); } /* * Process events on an endpoint. The work item carries a ref which * we must release. */ static void rxrpc_local_processor(struct work_struct *work) { struct rxrpc_local *local = container_of(work, struct rxrpc_local, processor); bool again; if (local->dead) return; trace_rxrpc_local(local->debug_id, rxrpc_local_processing, refcount_read(&local->ref), NULL); do { again = false; if (!__rxrpc_use_local(local)) { rxrpc_local_destroyer(local); break; } if (!skb_queue_empty(&local->reject_queue)) { rxrpc_reject_packets(local); again = true; } if (!skb_queue_empty(&local->event_queue)) { rxrpc_process_local_events(local); again = true; } __rxrpc_unuse_local(local); } while (again); rxrpc_put_local(local); } /* * Destroy a local endpoint after the RCU grace period expires. */ static void rxrpc_local_rcu(struct rcu_head *rcu) { struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu); _enter("%d", local->debug_id); ASSERT(!work_pending(&local->processor)); _net("DESTROY LOCAL %d", local->debug_id); kfree(local); _leave(""); } /* * Verify the local endpoint list is empty by this point. */ void rxrpc_destroy_all_locals(struct rxrpc_net *rxnet) { struct rxrpc_local *local; _enter(""); flush_workqueue(rxrpc_workqueue); if (!hlist_empty(&rxnet->local_endpoints)) { mutex_lock(&rxnet->local_mutex); hlist_for_each_entry(local, &rxnet->local_endpoints, link) { pr_err("AF_RXRPC: Leaked local %p {%d}\n", local, refcount_read(&local->ref)); } mutex_unlock(&rxnet->local_mutex); BUG(); } } |
64 2 13 8 5 64 59 14 43 15 105 106 106 58 58 43 15 58 64 64 63 63 64 13 58 57 58 64 64 13 13 13 63 64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 | // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC remote transport endpoint record management * * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/udp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/hashtable.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> #include <net/route.h> #include <net/ip6_route.h> #include "ar-internal.h" /* * Hash a peer key. */ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { const u16 *p; unsigned int i, size; unsigned long hash_key; _enter(""); hash_key = (unsigned long)local / __alignof__(*local); hash_key += srx->transport_type; hash_key += srx->transport_len; hash_key += srx->transport.family; switch (srx->transport.family) { case AF_INET: hash_key += (u16 __force)srx->transport.sin.sin_port; size = sizeof(srx->transport.sin.sin_addr); p = (u16 *)&srx->transport.sin.sin_addr; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: hash_key += (u16 __force)srx->transport.sin.sin_port; size = sizeof(srx->transport.sin6.sin6_addr); p = (u16 *)&srx->transport.sin6.sin6_addr; break; #endif default: WARN(1, "AF_RXRPC: Unsupported transport address family\n"); return 0; } /* Step through the peer address in 16-bit portions for speed */ for (i = 0; i < size; i += sizeof(*p), p++) hash_key += *p; _leave(" 0x%lx", hash_key); return hash_key; } /* * Compare a peer to a key. Return -ve, 0 or +ve to indicate less than, same * or greater than. * * Unfortunately, the primitives in linux/hashtable.h don't allow for sorted * buckets and mid-bucket insertion, so we don't make full use of this * information at this point. */ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer, struct rxrpc_local *local, const struct sockaddr_rxrpc *srx, unsigned long hash_key) { long diff; diff = ((peer->hash_key - hash_key) ?: ((unsigned long)peer->local - (unsigned long)local) ?: (peer->srx.transport_type - srx->transport_type) ?: (peer->srx.transport_len - srx->transport_len) ?: (peer->srx.transport.family - srx->transport.family)); if (diff != 0) return diff; switch (srx->transport.family) { case AF_INET: return ((u16 __force)peer->srx.transport.sin.sin_port - (u16 __force)srx->transport.sin.sin_port) ?: memcmp(&peer->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: return ((u16 __force)peer->srx.transport.sin6.sin6_port - (u16 __force)srx->transport.sin6.sin6_port) ?: memcmp(&peer->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); #endif default: BUG(); } } /* * Look up a remote transport endpoint for the specified address using RCU. */ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( struct rxrpc_local *local, const struct sockaddr_rxrpc *srx, unsigned long hash_key) { struct rxrpc_peer *peer; struct rxrpc_net *rxnet = local->rxnet; hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) { if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0 && refcount_read(&peer->ref) > 0) return peer; } return NULL; } /* * Look up a remote transport endpoint for the specified address using RCU. */ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { struct rxrpc_peer *peer; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer) { _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); _leave(" = %p {u=%d}", peer, refcount_read(&peer->ref)); } return peer; } /* * assess the MTU size for the network interface through which this peer is * reached */ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, struct rxrpc_peer *peer) { struct net *net = sock_net(&rx->sk); struct dst_entry *dst; struct rtable *rt; struct flowi fl; struct flowi4 *fl4 = &fl.u.ip4; #ifdef CONFIG_AF_RXRPC_IPV6 struct flowi6 *fl6 = &fl.u.ip6; #endif peer->if_mtu = 1500; memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { case AF_INET: rt = ip_route_output_ports( net, fl4, NULL, peer->srx.transport.sin.sin_addr.s_addr, 0, htons(7000), htons(7001), IPPROTO_UDP, 0, 0); if (IS_ERR(rt)) { _leave(" [route err %ld]", PTR_ERR(rt)); return; } dst = &rt->dst; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: fl6->flowi6_iif = LOOPBACK_IFINDEX; fl6->flowi6_scope = RT_SCOPE_UNIVERSE; fl6->flowi6_proto = IPPROTO_UDP; memcpy(&fl6->daddr, &peer->srx.transport.sin6.sin6_addr, sizeof(struct in6_addr)); fl6->fl6_dport = htons(7001); fl6->fl6_sport = htons(7000); dst = ip6_route_output(net, NULL, fl6); if (dst->error) { _leave(" [route err %d]", dst->error); return; } break; #endif default: BUG(); } peer->if_mtu = dst_mtu(dst); dst_release(dst); _leave(" [if_mtu %u]", peer->if_mtu); } /* * Allocate a peer. */ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) { const void *here = __builtin_return_address(0); struct rxrpc_peer *peer; _enter(""); peer = kzalloc(sizeof(struct rxrpc_peer), gfp); if (peer) { refcount_set(&peer->ref, 1); peer->local = rxrpc_get_local(local); INIT_HLIST_HEAD(&peer->error_targets); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); spin_lock_init(&peer->rtt_input_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); rxrpc_peer_init_rtt(peer); if (RXRPC_TX_SMSS > 2190) peer->cong_cwnd = 2; else if (RXRPC_TX_SMSS > 1095) peer->cong_cwnd = 3; else peer->cong_cwnd = 4; trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here); } _leave(" = %p", peer); return peer; } /* * Initialise peer record. */ static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; rxrpc_assess_MTU_size(rx, peer); peer->mtu = peer->if_mtu; peer->rtt_last_req = ktime_get_real(); switch (peer->srx.transport.family) { case AF_INET: peer->hdrsize = sizeof(struct iphdr); break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: peer->hdrsize = sizeof(struct ipv6hdr); break; #endif default: BUG(); } switch (peer->srx.transport_type) { case SOCK_DGRAM: peer->hdrsize += sizeof(struct udphdr); break; default: BUG(); } peer->hdrsize += sizeof(struct rxrpc_wire_header); peer->maxdata = peer->mtu - peer->hdrsize; } /* * Set up a new peer. */ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, struct sockaddr_rxrpc *srx, unsigned long hash_key, gfp_t gfp) { struct rxrpc_peer *peer; _enter(""); peer = rxrpc_alloc_peer(local, gfp); if (peer) { memcpy(&peer->srx, srx, sizeof(*srx)); rxrpc_init_peer(rx, peer, hash_key); } _leave(" = %p", peer); return peer; } static void rxrpc_free_peer(struct rxrpc_peer *peer) { rxrpc_put_local(peer->local); kfree_rcu(peer, rcu); } /* * Set up a new incoming peer. There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. */ void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &peer->srx); rxrpc_init_peer(rx, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); spin_unlock(&rxnet->peer_hash_lock); } /* * obtain a remote transport endpoint for the specified address */ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_peer *peer, *candidate; struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); _enter("{%pISp}", &srx->transport); /* search the peer list first */ rcu_read_lock(); peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; rcu_read_unlock(); if (!peer) { /* The peer is not yet present in hash - create a candidate * for a new record and then redo the search. */ candidate = rxrpc_create_peer(rx, local, srx, hash_key, gfp); if (!candidate) { _leave(" = NULL [nomem]"); return NULL; } spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; if (!peer) { hash_add_rcu(rxnet->peer_hash, &candidate->hash_link, hash_key); list_add_tail(&candidate->keepalive_link, &rxnet->peer_keepalive_new); } spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); else peer = candidate; } _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); _leave(" = %p {u=%d}", peer, refcount_read(&peer->ref)); return peer; } /* * Get a ref on a peer record. */ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); int r; __refcount_inc(&peer->ref, &r); trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, r + 1, here); return peer; } /* * Get a ref on a peer record unless its usage has already reached 0. */ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); int r; if (peer) { if (__refcount_inc_not_zero(&peer->ref, &r)) trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, r + 1, here); else peer = NULL; } return peer; } /* * Discard a peer record. */ static void __rxrpc_put_peer(struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = peer->local->rxnet; ASSERT(hlist_empty(&peer->error_targets)); spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } /* * Drop a ref on a peer record. */ void rxrpc_put_peer(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); unsigned int debug_id; bool dead; int r; if (peer) { debug_id = peer->debug_id; dead = __refcount_dec_and_test(&peer->ref, &r); trace_rxrpc_peer(debug_id, rxrpc_peer_put, r - 1, here); if (dead) __rxrpc_put_peer(peer); } } /* * Drop a ref on a peer record where the caller already holds the * peer_hash_lock. */ void rxrpc_put_peer_locked(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); unsigned int debug_id = peer->debug_id; bool dead; int r; dead = __refcount_dec_and_test(&peer->ref, &r); trace_rxrpc_peer(debug_id, rxrpc_peer_put, r - 1, here); if (dead) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); rxrpc_free_peer(peer); } } /* * Make sure all peer records have been discarded. */ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) { struct rxrpc_peer *peer; int i; for (i = 0; i < HASH_SIZE(rxnet->peer_hash); i++) { if (hlist_empty(&rxnet->peer_hash[i])) continue; hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) { pr_err("Leaked peer %u {%u} %pISp\n", peer->debug_id, refcount_read(&peer->ref), &peer->srx.transport); } } } /** * rxrpc_kernel_get_peer - Get the peer address of a call * @sock: The socket on which the call is in progress. * @call: The call to query * @_srx: Where to place the result * * Get the address of the remote peer in a call. */ void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, struct sockaddr_rxrpc *_srx) { *_srx = call->peer->srx; } EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @sock: The socket on which the call is in progress. * @call: The call to query * @_srtt: Where to store the SRTT value. * * Get the call's peer smoothed RTT in uS. */ bool rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call, u32 *_srtt) { struct rxrpc_peer *peer = call->peer; if (peer->rtt_count == 0) { *_srtt = 1000000; /* 1S */ return false; } *_srtt = call->peer->srtt_us >> 3; return true; } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); |
5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the ICMP protocol. * * Version: @(#)icmp.h 1.0.3 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_ICMP_H #define _LINUX_ICMP_H #include <linux/skbuff.h> #include <uapi/linux/icmp.h> #include <uapi/linux/errqueue.h> static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { return (struct icmphdr *)skb_transport_header(skb); } static inline bool icmp_is_err(int type) { switch (type) { case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_REDIRECT: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: return true; } return false; } void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off); #endif /* _LINUX_ICMP_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IEEE802154_CORE_H #define __IEEE802154_CORE_H #include <net/cfg802154.h> struct cfg802154_registered_device { const struct cfg802154_ops *ops; struct list_head list; /* wpan_phy index, internal only */ int wpan_phy_idx; /* also protected by devlist_mtx */ int opencount; wait_queue_head_t dev_wait; /* protected by RTNL only */ int num_running_ifaces; /* associated wpan interfaces, protected by rtnl or RCU */ struct list_head wpan_dev_list; int devlist_generation, wpan_dev_id; /* must be last because of the way we do wpan_phy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN); }; static inline struct cfg802154_registered_device * wpan_phy_to_rdev(struct wpan_phy *wpan_phy) { BUG_ON(!wpan_phy); return container_of(wpan_phy, struct cfg802154_registered_device, wpan_phy); } extern struct list_head cfg802154_rdev_list; extern int cfg802154_rdev_list_generation; int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net); /* free object */ void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx); struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx); #endif /* __IEEE802154_CORE_H */ |
635 12147 12144 12131 560 473 234 561 91 91 91 91 10687 10681 10202 10206 10199 2698 2700 2702 2700 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 | // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/sched/cputime.h> static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) { return per_cpu_ptr(cgrp->rstat_cpu, cpu); } /** * cgroup_rstat_updated - keep track of updated rstat_cpu * @cgrp: target cgroup * @cpu: cpu on which rstat_cpu was updated * * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching * rstat_cpu->updated_children list. See the comment on top of * cgroup_rstat_cpu definition for details. */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); unsigned long flags; /* * Speculative already-on-list test. This may race leading to * temporary inaccuracies, which is fine. * * Because @parent's updated_children is terminated with @parent * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. */ if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) return; raw_spin_lock_irqsave(cpu_lock, flags); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup * is already in the tree, all ancestors are. */ if (rstatc->updated_next) break; /* Root has no parent to link it to, but mark it busy */ if (!parent) { rstatc->updated_next = cgrp; break; } prstatc = cgroup_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; prstatc->updated_children = cgrp; cgrp = parent; } raw_spin_unlock_irqrestore(cpu_lock, flags); } /** * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree * @pos: current position * @root: root of the tree to traversal * @cpu: target cpu * * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts * the traversal and %NULL return indicates the end. During traversal, * each returned cgroup is unlinked from the tree. Must be called with the * matching cgroup_rstat_cpu_lock held. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, if a child is visited, its parent is * guaranteed to be visited afterwards. */ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; if (pos == root) return NULL; /* * We're gonna walk down to the first leaf and visit/remove it. We * can pick whatever unvisited node as the starting point. */ if (!pos) pos = root; else pos = cgroup_parent(pos); /* walk down to the first leaf */ while (true) { rstatc = cgroup_rstat_cpu(pos, cpu); if (rstatc->updated_children == pos) break; pos = rstatc->updated_children; } /* * Unlink @pos from the tree. As the updated_children list is * singly linked, we have to walk it to find the removal point. * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ if (rstatc->updated_next) { struct cgroup *parent = cgroup_parent(pos); if (parent) { struct cgroup_rstat_cpu *prstatc; struct cgroup **nextp; prstatc = cgroup_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (true) { struct cgroup_rstat_cpu *nrstatc; nrstatc = cgroup_rstat_cpu(*nextp, cpu); if (*nextp == pos) break; WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } *nextp = rstatc->updated_next; } rstatc->updated_next = NULL; return pos; } /* only happens for @root */ return NULL; } /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) { int cpu; lockdep_assert_held(&cgroup_rstat_lock); for_each_possible_cpu(cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; raw_spin_lock(cpu_lock); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; cgroup_base_stat_flush(pos, cpu); rcu_read_lock(); list_for_each_entry_rcu(css, &pos->rstat_css_list, rstat_css_node) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } raw_spin_unlock(cpu_lock); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || spin_needbreak(&cgroup_rstat_lock))) { spin_unlock_irq(&cgroup_rstat_lock); if (!cond_resched()) cpu_relax(); spin_lock_irq(&cgroup_rstat_lock); } } } /** * cgroup_rstat_flush - flush stats in @cgrp's subtree * @cgrp: target cgroup * * Collect all per-cpu stats in @cgrp's subtree into the global counters * and propagate them upwards. After this function returns, all cgroups in * the subtree have up-to-date ->stat. * * This also gets all cgroups in the subtree including @cgrp off the * ->updated_children lists. * * This function may block. */ void cgroup_rstat_flush(struct cgroup *cgrp) { might_sleep(); spin_lock_irq(&cgroup_rstat_lock); cgroup_rstat_flush_locked(cgrp, true); spin_unlock_irq(&cgroup_rstat_lock); } /** * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() * @cgrp: target cgroup * * This function can be called from any context. */ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) { unsigned long flags; spin_lock_irqsave(&cgroup_rstat_lock, flags); cgroup_rstat_flush_locked(cgrp, false); spin_unlock_irqrestore(&cgroup_rstat_lock, flags); } /** * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold * @cgrp: target cgroup * * Flush stats in @cgrp's subtree and prevent further flushes. Must be * paired with cgroup_rstat_flush_release(). * * This function may block. */ void cgroup_rstat_flush_hold(struct cgroup *cgrp) __acquires(&cgroup_rstat_lock) { might_sleep(); spin_lock_irq(&cgroup_rstat_lock); cgroup_rstat_flush_locked(cgrp, true); } /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() */ void cgroup_rstat_flush_release(void) __releases(&cgroup_rstat_lock) { spin_unlock_irq(&cgroup_rstat_lock); } int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; /* the root cgrp has rstat_cpu preallocated */ if (!cgrp->rstat_cpu) { cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); if (!cgrp->rstat_cpu) return -ENOMEM; } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); rstatc->updated_children = cgrp; u64_stats_init(&rstatc->bsync); } return 0; } void cgroup_rstat_exit(struct cgroup *cgrp) { int cpu; cgroup_rstat_flush(cgrp); /* sanity check */ for_each_possible_cpu(cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || WARN_ON_ONCE(rstatc->updated_next)) return; } free_percpu(cgrp->rstat_cpu); cgrp->rstat_cpu = NULL; } void __init cgroup_rstat_boot(void) { int cpu; for_each_possible_cpu(cpu) raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); } /* * Functions for cgroup basic resource statistics implemented on top of * rstat. */ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime -= src_bstat->cputime.utime; dst_bstat->cputime.stime -= src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_base_stat cur, delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ if (!parent) return; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); /* propagate percpu delta to global */ delta = cur; cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); /* propagate global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); } } static struct cgroup_rstat_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { struct cgroup_rstat_cpu *rstatc; rstatc = get_cpu_ptr(cgrp->rstat_cpu); *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); return rstatc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, struct cgroup_rstat_cpu *rstatc, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatc->bsync, flags); cgroup_rstat_updated(cgrp, smp_processor_id()); put_cpu_ptr(rstatc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; unsigned long flags; rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); rstatc->bstat.cputime.sum_exec_runtime += delta_exec; cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; unsigned long flags; rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_USER: case CPUTIME_NICE: rstatc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: rstatc->bstat.cputime.stime += delta_exec; break; default: break; } cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } /* * compute the cputime for the root cgroup by getting the per cpu data * at a global level, then categorizing the fields in a manner consistent * with how it is done by __cgroup_account_cputime_field for each bit of * cpu time attributed to a cgroup. */ static void root_cgroup_cputime(struct task_cputime *cputime) { int i; cputime->stime = 0; cputime->utime = 0; cputime->sum_exec_runtime = 0; for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u64 user = 0; u64 sys = 0; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; user += cpustat[CPUTIME_NICE]; cputime->utime += user; sys += cpustat[CPUTIME_SYSTEM]; sys += cpustat[CPUTIME_IRQ]; sys += cpustat[CPUTIME_SOFTIRQ]; cputime->stime += sys; cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; } } void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; struct task_cputime cputime; if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); cgroup_rstat_flush_release(); } else { root_cgroup_cputime(&cputime); usage = cputime.sum_exec_runtime; utime = cputime.utime; stime = cputime.stime; } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n", usage, utime, stime); } |
15 15 15 15 15 15 15 15 15 15 15 15 11 4 15 15 15 15 15 15 15 15 15 15 15 15 6 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 20 15 11 4 15 14 15 15 15 35 35 6 6 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 3 15 3 15 15 15 19 20 15 5 19 20 20 20 20 20 20 15 15 3 12 15 3 8 7 15 15 35 35 35 35 32 33 22 2 20 20 20 20 19 19 20 5 15 19 20 6 16 22 22 27 27 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2018-2021 Intel Corporation * * Transmit and frame generation functions. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <linux/etherdevice.h> #include <linux/bitmap.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/timekeeping.h> #include <net/net_namespace.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> #include <net/codel.h> #include <net/codel_impl.h> #include <asm/unaligned.h> #include <net/fq_impl.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "led.h" #include "mesh.h" #include "wep.h" #include "wpa.h" #include "wme.h" #include "rate.h" /* misc utils */ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct sk_buff *skb, int group_addr, int next_frag_len) { int rate, mrate, erp, dur, i, shift = 0; struct ieee80211_rate *txrate; struct ieee80211_local *local = tx->local; struct ieee80211_supported_band *sband; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_chanctx_conf *chanctx_conf; u32 rate_flags = 0; /* assume HW handles this */ if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) return 0; rcu_read_lock(); chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf); if (chanctx_conf) { shift = ieee80211_chandef_get_shift(&chanctx_conf->def); rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def); } rcu_read_unlock(); /* uh huh? */ if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; sband = local->hw.wiphy->bands[info->band]; txrate = &sband->bitrates[tx->rate.idx]; erp = txrate->flags & IEEE80211_RATE_ERP_G; /* device is expected to do this */ if (sband->band == NL80211_BAND_S1GHZ) return 0; /* * data and mgmt (except PS Poll): * - during CFP: 32768 * - during contention period: * if addr1 is group address: 0 * if more fragments = 0 and addr1 is individual address: time to * transmit one ACK plus SIFS * if more fragments = 1 and addr1 is individual address: time to * transmit next fragment plus 2 x ACK plus 3 x SIFS * * IEEE 802.11, 9.6: * - control response frame (CTS or ACK) shall be transmitted using the * same rate as the immediately previous frame in the frame exchange * sequence, if this rate belongs to the PHY mandatory rates, or else * at the highest possible rate belonging to the PHY rates in the * BSSBasicRateSet */ hdr = (struct ieee80211_hdr *)skb->data; if (ieee80211_is_ctl(hdr->frame_control)) { /* TODO: These control frames are not currently sent by * mac80211, but should they be implemented, this function * needs to be updated to support duration field calculation. * * RTS: time needed to transmit pending data/mgmt frame plus * one CTS frame plus one ACK frame plus 3 x SIFS * CTS: duration of immediately previous RTS minus time * required to transmit CTS and its SIFS * ACK: 0 if immediately previous directed data/mgmt had * more=0, with more=1 duration in ACK frame is duration * from previous frame minus time needed to transmit ACK * and its SIFS * PS Poll: BIT(15) | BIT(14) | aid */ return 0; } /* data/mgmt */ if (0 /* FIX: data/mgmt during CFP */) return cpu_to_le16(32768); if (group_addr) /* Group address as the destination - no ACK */ return 0; /* Individual destination address: * IEEE 802.11, Ch. 9.6 (after IEEE 802.11g changes) * CTS and ACK frames shall be transmitted using the highest rate in * basic rate set that is less than or equal to the rate of the * immediately previous frame and that is using the same modulation * (CCK or OFDM). If no basic rate set matches with these requirements, * the highest mandatory rate of the PHY that is less than or equal to * the rate of the previous frame is used. * Mandatory rates for IEEE 802.11g PHY: 1, 2, 5.5, 11, 6, 12, 24 Mbps */ rate = -1; /* use lowest available if everything fails */ mrate = sband->bitrates[0].bitrate; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *r = &sband->bitrates[i]; if (r->bitrate > txrate->bitrate) break; if ((rate_flags & r->flags) != rate_flags) continue; if (tx->sdata->vif.bss_conf.basic_rates & BIT(i)) rate = DIV_ROUND_UP(r->bitrate, 1 << shift); switch (sband->band) { case NL80211_BAND_2GHZ: { u32 flag; if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) flag = IEEE80211_RATE_MANDATORY_G; else flag = IEEE80211_RATE_MANDATORY_B; if (r->flags & flag) mrate = r->bitrate; break; } case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; case NL80211_BAND_S1GHZ: case NL80211_BAND_60GHZ: /* TODO, for now fall through */ case NUM_NL80211_BANDS: WARN_ON(1); break; } } if (rate == -1) { /* No matching basic rate found; use highest suitable mandatory * PHY rate */ rate = DIV_ROUND_UP(mrate, 1 << shift); } /* Don't calculate ACKs for QoS Frames with NoAck Policy set */ if (ieee80211_is_data_qos(hdr->frame_control) && *(ieee80211_get_qos_ctl(hdr)) & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) dur = 0; else /* Time needed to transmit ACK * (10 bytes + 4-byte FCS = 112 bits) plus SIFS; rounded up * to closest integer */ dur = ieee80211_frame_duration(sband->band, 10, rate, erp, tx->sdata->vif.bss_conf.use_short_preamble, shift); if (next_frag_len) { /* Frame is fragmented: duration increases with time needed to * transmit next fragment plus ACK and 2 x SIFS. */ dur *= 2; /* ACK + SIFS */ /* next fragment */ dur += ieee80211_frame_duration(sband->band, next_frag_len, txrate->bitrate, erp, tx->sdata->vif.bss_conf.use_short_preamble, shift); } return cpu_to_le16(dur); } /* tx handlers */ static ieee80211_tx_result debug_noinline ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) { struct ieee80211_local *local = tx->local; struct ieee80211_if_managed *ifmgd; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); /* driver doesn't support power save */ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return TX_CONTINUE; /* hardware does dynamic power save */ if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) return TX_CONTINUE; /* dynamic power save disabled */ if (local->hw.conf.dynamic_ps_timeout <= 0) return TX_CONTINUE; /* we are scanning, don't enable power save */ if (local->scanning) return TX_CONTINUE; if (!local->ps_sdata) return TX_CONTINUE; /* No point if we're going to suspend */ if (local->quiescing) return TX_CONTINUE; /* dynamic ps is supported only in managed mode */ if (tx->sdata->vif.type != NL80211_IFTYPE_STATION) return TX_CONTINUE; if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) return TX_CONTINUE; ifmgd = &tx->sdata->u.mgd; /* * Don't wakeup from power save if u-apsd is enabled, voip ac has * u-apsd enabled and the frame is in voip class. This effectively * means that even if all access categories have u-apsd enabled, in * practise u-apsd is only used with the voip ac. This is a * workaround for the case when received voip class packets do not * have correct qos tag for some reason, due the network or the * peer application. * * Note: ifmgd->uapsd_queues access is racy here. If the value is * changed via debugfs, user needs to reassociate manually to have * everything in sync. */ if ((ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) && (ifmgd->uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) && skb_get_queue_mapping(tx->skb) == IEEE80211_AC_VO) return TX_CONTINUE; if (local->hw.conf.flags & IEEE80211_CONF_PS) { ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_PS, false); ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; ieee80211_queue_work(&local->hw, &local->dynamic_ps_disable_work); } /* Don't restart the timer if we're not disassociated */ if (!ifmgd->associated) return TX_CONTINUE; mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); bool assoc = false; if (unlikely(info->flags & IEEE80211_TX_CTL_INJECTED)) return TX_CONTINUE; if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) && !ieee80211_is_probe_req(hdr->frame_control) && !ieee80211_is_any_nullfunc(hdr->frame_control)) /* * When software scanning only nullfunc frames (to notify * the sleep state to the AP) and probe requests (for the * active scan) are allowed, all other frames should not be * sent and we should not get here, but if we do * nonetheless, drop them to avoid sending them * off-channel. See the link below and * ieee80211_start_scan() for more. * * http://article.gmane.org/gmane.linux.kernel.wireless.general/30089 */ return TX_DROP; if (tx->sdata->vif.type == NL80211_IFTYPE_OCB) return TX_CONTINUE; if (tx->flags & IEEE80211_TX_PS_BUFFERED) return TX_CONTINUE; if (tx->sta) assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC); if (likely(tx->flags & IEEE80211_TX_UNICAST)) { if (unlikely(!assoc && ieee80211_is_data(hdr->frame_control))) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG sdata_info(tx->sdata, "dropped data frame to not associated station %pM\n", hdr->addr1); #endif I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc); return TX_DROP; } } else if (unlikely(ieee80211_is_data(hdr->frame_control) && ieee80211_vif_get_num_mcast_if(tx->sdata) == 0)) { /* * No associated STAs - no need to send multicast * frames. */ return TX_DROP; } return TX_CONTINUE; } /* This function is called whenever the AP is about to exceed the maximum limit * of buffered frames for power saving STAs. This situation should not really * happen often during normal operation, so dropping the oldest buffered packet * from each queue should be OK to make some room for new frames. */ static void purge_old_ps_buffers(struct ieee80211_local *local) { int total = 0, purged = 0; struct sk_buff *skb; struct ieee80211_sub_if_data *sdata; struct sta_info *sta; list_for_each_entry_rcu(sdata, &local->interfaces, list) { struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->u.ap.ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else continue; skb = skb_dequeue(&ps->bc_buf); if (skb) { purged++; ieee80211_free_txskb(&local->hw, skb); } total += skb_queue_len(&ps->bc_buf); } /* * Drop one frame from each station from the lowest-priority * AC that has frames at all. */ list_for_each_entry_rcu(sta, &local->sta_list, list) { int ac; for (ac = IEEE80211_AC_BK; ac >= IEEE80211_AC_VO; ac--) { skb = skb_dequeue(&sta->ps_tx_buf[ac]); total += skb_queue_len(&sta->ps_tx_buf[ac]); if (skb) { purged++; ieee80211_free_txskb(&local->hw, skb); break; } } } local->total_ps_buffered = total; ps_dbg_hw(&local->hw, "PS buffers full - purged %d frames\n", purged); } static ieee80211_tx_result ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; struct ps_data *ps; /* * broadcast/multicast frame * * If any of the associated/peer stations is in power save mode, * the frame is buffered to be sent after DTIM beacon frame. * This is done either by the hardware or us. */ /* powersaving STAs currently only in AP/VLAN/mesh mode */ if (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!tx->sdata->bss) return TX_CONTINUE; ps = &tx->sdata->bss->ps; } else if (ieee80211_vif_is_mesh(&tx->sdata->vif)) { ps = &tx->sdata->u.mesh.ps; } else { return TX_CONTINUE; } /* no buffering for ordered frames */ if (ieee80211_has_order(hdr->frame_control)) return TX_CONTINUE; if (ieee80211_is_probe_req(hdr->frame_control)) return TX_CONTINUE; if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; /* no stations in PS mode and no buffered packets */ if (!atomic_read(&ps->num_sta_ps) && skb_queue_empty(&ps->bc_buf)) return TX_CONTINUE; info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; /* device releases frame after DTIM beacon */ if (!ieee80211_hw_check(&tx->local->hw, HOST_BROADCAST_PS_BUFFERING)) return TX_CONTINUE; /* buffered in mac80211 */ if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER) purge_old_ps_buffers(tx->local); if (skb_queue_len(&ps->bc_buf) >= AP_MAX_BC_BUFFER) { ps_dbg(tx->sdata, "BC TX buffer full - dropping the oldest frame\n"); ieee80211_free_txskb(&tx->local->hw, skb_dequeue(&ps->bc_buf)); } else tx->local->total_ps_buffered++; skb_queue_tail(&ps->bc_buf, tx->skb); return TX_QUEUED; } static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta, struct sk_buff *skb) { if (!ieee80211_is_mgmt(fc)) return 0; if (sta == NULL || !test_sta_flag(sta, WLAN_STA_MFP)) return 0; if (!ieee80211_is_robust_mgmt_frame(skb)) return 0; return 1; } static ieee80211_tx_result ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) { struct sta_info *sta = tx->sta; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; struct ieee80211_local *local = tx->local; if (unlikely(!sta)) return TX_CONTINUE; if (unlikely((test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER)) && !(info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER))) { int ac = skb_get_queue_mapping(tx->skb); if (ieee80211_is_mgmt(hdr->frame_control) && !ieee80211_is_bufferable_mmpdu(hdr->frame_control)) { info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; return TX_CONTINUE; } ps_dbg(sta->sdata, "STA %pM aid %d: PS buffer for AC %d\n", sta->sta.addr, sta->sta.aid, ac); if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER) purge_old_ps_buffers(tx->local); /* sync with ieee80211_sta_ps_deliver_wakeup */ spin_lock(&sta->ps_lock); /* * STA woke up the meantime and all the frames on ps_tx_buf have * been queued to pending queue. No reordering can happen, go * ahead and Tx the packet. */ if (!test_sta_flag(sta, WLAN_STA_PS_STA) && !test_sta_flag(sta, WLAN_STA_PS_DRIVER) && !test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { spin_unlock(&sta->ps_lock); return TX_CONTINUE; } if (skb_queue_len(&sta->ps_tx_buf[ac]) >= STA_MAX_TX_BUFFER) { struct sk_buff *old = skb_dequeue(&sta->ps_tx_buf[ac]); ps_dbg(tx->sdata, "STA %pM TX buffer for AC %d full - dropping oldest frame\n", sta->sta.addr, ac); ieee80211_free_txskb(&local->hw, old); } else tx->local->total_ps_buffered++; info->control.jiffies = jiffies; info->control.vif = &tx->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; skb_queue_tail(&sta->ps_tx_buf[ac], tx->skb); spin_unlock(&sta->ps_lock); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); /* * We queued up some frames, so the TIM bit might * need to be set, recalculate it. */ sta_info_recalc_tim(sta); return TX_QUEUED; } else if (unlikely(test_sta_flag(sta, WLAN_STA_PS_STA))) { ps_dbg(tx->sdata, "STA %pM in PS mode, but polling/in SP -> send frame\n", sta->sta.addr); } return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx) { if (unlikely(tx->flags & IEEE80211_TX_PS_BUFFERED)) return TX_CONTINUE; if (tx->flags & IEEE80211_TX_UNICAST) return ieee80211_tx_h_unicast_ps_buf(tx); else return ieee80211_tx_h_multicast_ps_buf(tx); } static ieee80211_tx_result debug_noinline ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol)) { if (tx->sdata->control_port_no_encrypt) info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; info->flags |= IEEE80211_TX_CTL_USE_MINRATE; } return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) { struct ieee80211_key *key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) { tx->key = NULL; return TX_CONTINUE; } if (tx->sta && (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx]))) tx->key = key; else if (ieee80211_is_group_privacy_action(tx->skb) && (key = rcu_dereference(tx->sdata->default_multicast_key))) tx->key = key; else if (ieee80211_is_mgmt(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && ieee80211_is_robust_mgmt_frame(tx->skb) && (key = rcu_dereference(tx->sdata->default_mgmt_key))) tx->key = key; else if (is_multicast_ether_addr(hdr->addr1) && (key = rcu_dereference(tx->sdata->default_multicast_key))) tx->key = key; else if (!is_multicast_ether_addr(hdr->addr1) && (key = rcu_dereference(tx->sdata->default_unicast_key))) tx->key = key; else tx->key = NULL; if (tx->key) { bool skip_hw = false; /* TODO: add threshold stuff again */ switch (tx->key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: case WLAN_CIPHER_SUITE_TKIP: if (!ieee80211_is_data_present(hdr->frame_control)) tx->key = NULL; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (!ieee80211_is_data_present(hdr->frame_control) && !ieee80211_use_mfp(hdr->frame_control, tx->sta, tx->skb) && !ieee80211_is_group_privacy_action(tx->skb)) tx->key = NULL; else skip_hw = (tx->key->conf.flags & IEEE80211_KEY_FLAG_SW_MGMT_TX) && ieee80211_is_mgmt(hdr->frame_control); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (!ieee80211_is_mgmt(hdr->frame_control)) tx->key = NULL; break; } if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED && !ieee80211_is_deauth(hdr->frame_control)) && tx->skb->protocol != tx->sdata->control_port_protocol) return TX_DROP; if (!skip_hw && tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta && test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { return TX_DROP; } return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (void *)tx->skb->data; struct ieee80211_supported_band *sband; u32 len; struct ieee80211_tx_rate_control txrc; struct ieee80211_sta_rates *ratetbl = NULL; bool encap = info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP; bool assoc = false; memset(&txrc, 0, sizeof(txrc)); sband = tx->local->hw.wiphy->bands[info->band]; len = min_t(u32, tx->skb->len + FCS_LEN, tx->local->hw.wiphy->frag_threshold); /* set up the tx rate control struct we give the RC algo */ txrc.hw = &tx->local->hw; txrc.sband = sband; txrc.bss_conf = &tx->sdata->vif.bss_conf; txrc.skb = tx->skb; txrc.reported_rate.idx = -1; txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[info->band]; if (tx->sdata->rc_has_mcs_mask[info->band]) txrc.rate_idx_mcs_mask = tx->sdata->rc_rateidx_mcs_mask[info->band]; txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT || tx->sdata->vif.type == NL80211_IFTYPE_ADHOC || tx->sdata->vif.type == NL80211_IFTYPE_OCB); /* set up RTS protection if desired */ if (len > tx->local->hw.wiphy->rts_threshold) { txrc.rts = true; } info->control.use_rts = txrc.rts; info->control.use_cts_prot = tx->sdata->vif.bss_conf.use_cts_prot; /* * Use short preamble if the BSS can handle it, but not for * management frames unless we know the receiver can handle * that -- the management frame might be to a station that * just wants a probe response. */ if (tx->sdata->vif.bss_conf.use_short_preamble && (ieee80211_is_tx_data(tx->skb) || (tx->sta && test_sta_flag(tx->sta, WLAN_STA_SHORT_PREAMBLE)))) txrc.short_preamble = true; info->control.short_preamble = txrc.short_preamble; /* don't ask rate control when rate already injected via radiotap */ if (info->control.flags & IEEE80211_TX_CTRL_RATE_INJECT) return TX_CONTINUE; if (tx->sta) assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC); /* * Lets not bother rate control if we're associated and cannot * talk to the sta. This should not happen. */ if (WARN(test_bit(SCAN_SW_SCANNING, &tx->local->scanning) && assoc && !rate_usable_index_exists(sband, &tx->sta->sta), "%s: Dropped data frame as no usable bitrate found while " "scanning and associated. Target station: " "%pM on %d GHz band\n", tx->sdata->name, encap ? ((struct ethhdr *)hdr)->h_dest : hdr->addr1, info->band ? 5 : 2)) return TX_DROP; /* * If we're associated with the sta at this point we know we can at * least send the frame at the lowest bit rate. */ rate_control_get_rate(tx->sdata, tx->sta, &txrc); if (tx->sta && !info->control.skip_table) ratetbl = rcu_dereference(tx->sta->sta.rates); if (unlikely(info->control.rates[0].idx < 0)) { if (ratetbl) { struct ieee80211_tx_rate rate = { .idx = ratetbl->rate[0].idx, .flags = ratetbl->rate[0].flags, .count = ratetbl->rate[0].count }; if (ratetbl->rate[0].idx < 0) return TX_DROP; tx->rate = rate; } else { return TX_DROP; } } else { tx->rate = info->control.rates[0]; } if (txrc.reported_rate.idx < 0) { txrc.reported_rate = tx->rate; if (tx->sta && ieee80211_is_tx_data(tx->skb)) tx->sta->tx_stats.last_rate = txrc.reported_rate; } else if (tx->sta) tx->sta->tx_stats.last_rate = txrc.reported_rate; if (ratetbl) return TX_CONTINUE; if (unlikely(!info->control.rates[0].count)) info->control.rates[0].count = 1; if (WARN_ON_ONCE((info->control.rates[0].count > 1) && (info->flags & IEEE80211_TX_CTL_NO_ACK))) info->control.rates[0].count = 1; return TX_CONTINUE; } static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) { u16 *seq = &sta->tid_seq[tid]; __le16 ret = cpu_to_le16(*seq); /* Increase the sequence number. */ *seq = (*seq + 0x10) & IEEE80211_SCTL_SEQ; return ret; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; int tid; /* * Packet injection may want to control the sequence * number, if we have no matching interface then we * neither assign one ourselves nor ask the driver to. */ if (unlikely(info->control.vif->type == NL80211_IFTYPE_MONITOR)) return TX_CONTINUE; if (unlikely(ieee80211_is_ctl(hdr->frame_control))) return TX_CONTINUE; if (ieee80211_hdrlen(hdr->frame_control) < 24) return TX_CONTINUE; if (ieee80211_is_qos_nullfunc(hdr->frame_control)) return TX_CONTINUE; if (info->control.flags & IEEE80211_TX_CTRL_NO_SEQNO) return TX_CONTINUE; /* * Anything but QoS data that has a sequence number field * (is long enough) gets a sequence number from the global * counter. QoS data frames with a multicast destination * also use the global counter (802.11-2012 9.3.2.10). */ if (!ieee80211_is_data_qos(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) { /* driver should assign sequence number */ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; /* for pure STA mode without beacons, we can do it */ hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number); tx->sdata->sequence_number += 0x10; if (tx->sta) tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++; return TX_CONTINUE; } /* * This should be true for injected/management frames only, for * management frames we have set the IEEE80211_TX_CTL_ASSIGN_SEQ * above since they are not QoS-data frames. */ if (!tx->sta) return TX_CONTINUE; /* include per-STA, per-TID sequence counter */ tid = ieee80211_get_tid(hdr); tx->sta->tx_stats.msdu[tid]++; hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; } static int ieee80211_fragment(struct ieee80211_tx_data *tx, struct sk_buff *skb, int hdrlen, int frag_threshold) { struct ieee80211_local *local = tx->local; struct ieee80211_tx_info *info; struct sk_buff *tmp; int per_fragm = frag_threshold - hdrlen - FCS_LEN; int pos = hdrlen + per_fragm; int rem = skb->len - hdrlen - per_fragm; if (WARN_ON(rem < 0)) return -EINVAL; /* first fragment was already added to queue by caller */ while (rem) { int fraglen = per_fragm; if (fraglen > rem) fraglen = rem; rem -= fraglen; tmp = dev_alloc_skb(local->tx_headroom + frag_threshold + tx->sdata->encrypt_headroom + IEEE80211_ENCRYPT_TAILROOM); if (!tmp) return -ENOMEM; __skb_queue_tail(&tx->skbs, tmp); skb_reserve(tmp, local->tx_headroom + tx->sdata->encrypt_headroom); /* copy control information */ memcpy(tmp->cb, skb->cb, sizeof(tmp->cb)); info = IEEE80211_SKB_CB(tmp); info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_FIRST_FRAGMENT); if (rem) info->flags |= IEEE80211_TX_CTL_MORE_FRAMES; skb_copy_queue_mapping(tmp, skb); tmp->priority = skb->priority; tmp->dev = skb->dev; /* copy header and data */ skb_put_data(tmp, skb->data, hdrlen); skb_put_data(tmp, skb->data + pos, fraglen); pos += fraglen; } /* adjust first fragment's length */ skb_trim(skb, hdrlen + per_fragm); return 0; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx) { struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; int frag_threshold = tx->local->hw.wiphy->frag_threshold; int hdrlen; int fragnum; /* no matter what happens, tx->skb moves to tx->skbs */ __skb_queue_tail(&tx->skbs, skb); tx->skb = NULL; if (info->flags & IEEE80211_TX_CTL_DONTFRAG) return TX_CONTINUE; if (ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) return TX_CONTINUE; /* * Warn when submitting a fragmented A-MPDU frame and drop it. * This scenario is handled in ieee80211_tx_prepare but extra * caution taken here as fragmented ampdu may cause Tx stop. */ if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU)) return TX_DROP; hdrlen = ieee80211_hdrlen(hdr->frame_control); /* internal error, why isn't DONTFRAG set? */ if (WARN_ON(skb->len + FCS_LEN <= frag_threshold)) return TX_DROP; /* * Now fragment the frame. This will allocate all the fragments and * chain them (using skb as the first fragment) to skb->next. * During transmission, we will remove the successfully transmitted * fragments from this list. When the low-level driver rejects one * of the fragments then we will simply pretend to accept the skb * but store it away as pending. */ if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold)) return TX_DROP; /* update duration/seq/flags of fragments */ fragnum = 0; skb_queue_walk(&tx->skbs, skb) { const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS); hdr = (void *)skb->data; info = IEEE80211_SKB_CB(skb); if (!skb_queue_is_last(&tx->skbs, skb)) { hdr->frame_control |= morefrags; /* * No multi-rate retries for fragmented frames, that * would completely throw off the NAV at other STAs. */ info->control.rates[1].idx = -1; info->control.rates[2].idx = -1; info->control.rates[3].idx = -1; BUILD_BUG_ON(IEEE80211_TX_MAX_RATES != 4); info->flags &= ~IEEE80211_TX_CTL_RATE_CTRL_PROBE; } else { hdr->frame_control &= ~morefrags; } hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG); fragnum++; } return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) { struct sk_buff *skb; int ac = -1; if (!tx->sta) return TX_CONTINUE; skb_queue_walk(&tx->skbs, skb) { ac = skb_get_queue_mapping(skb); tx->sta->tx_stats.bytes[ac] += skb->len; } if (ac >= 0) tx->sta->tx_stats.packets[ac]++; return TX_CONTINUE; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) { if (!tx->key) return TX_CONTINUE; switch (tx->key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: return ieee80211_crypto_wep_encrypt(tx); case WLAN_CIPHER_SUITE_TKIP: return ieee80211_crypto_tkip_encrypt(tx); case WLAN_CIPHER_SUITE_CCMP: return ieee80211_crypto_ccmp_encrypt( tx, IEEE80211_CCMP_MIC_LEN); case WLAN_CIPHER_SUITE_CCMP_256: return ieee80211_crypto_ccmp_encrypt( tx, IEEE80211_CCMP_256_MIC_LEN); case WLAN_CIPHER_SUITE_AES_CMAC: return ieee80211_crypto_aes_cmac_encrypt(tx); case WLAN_CIPHER_SUITE_BIP_CMAC_256: return ieee80211_crypto_aes_cmac_256_encrypt(tx); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return ieee80211_crypto_aes_gmac_encrypt(tx); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: return ieee80211_crypto_gcmp_encrypt(tx); default: return ieee80211_crypto_hw_encrypt(tx); } return TX_DROP; } static ieee80211_tx_result debug_noinline ieee80211_tx_h_calculate_duration(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_hdr *hdr; int next_len; bool group_addr; skb_queue_walk(&tx->skbs, skb) { hdr = (void *) skb->data; if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) break; /* must not overwrite AID */ if (!skb_queue_is_last(&tx->skbs, skb)) { struct sk_buff *next = skb_queue_next(&tx->skbs, skb); next_len = next->len; } else next_len = 0; group_addr = is_multicast_ether_addr(hdr->addr1); hdr->duration_id = ieee80211_duration(tx, skb, group_addr, next_len); } return TX_CONTINUE; } /* actual transmit path */ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, struct sk_buff *skb, struct ieee80211_tx_info *info, struct tid_ampdu_tx *tid_tx, int tid) { bool queued = false; bool reset_agg_timer = false; struct sk_buff *purge_skb = NULL; if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { info->flags |= IEEE80211_TX_CTL_AMPDU; reset_agg_timer = true; } else if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) { /* * nothing -- this aggregation session is being started * but that might still fail with the driver */ } else if (!tx->sta->sta.txq[tid]) { spin_lock(&tx->sta->lock); /* * Need to re-check now, because we may get here * * 1) in the window during which the setup is actually * already done, but not marked yet because not all * packets are spliced over to the driver pending * queue yet -- if this happened we acquire the lock * either before or after the splice happens, but * need to recheck which of these cases happened. * * 2) during session teardown, if the OPERATIONAL bit * was cleared due to the teardown but the pointer * hasn't been assigned NULL yet (or we loaded it * before it was assigned) -- in this case it may * now be NULL which means we should just let the * packet pass through because splicing the frames * back is already done. */ tid_tx = rcu_dereference_protected_tid_tx(tx->sta, tid); if (!tid_tx) { /* do nothing, let packet pass through */ } else if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { info->flags |= IEEE80211_TX_CTL_AMPDU; reset_agg_timer = true; } else { queued = true; if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { clear_sta_flag(tx->sta, WLAN_STA_SP); ps_dbg(tx->sta->sdata, "STA %pM aid %d: SP frame queued, close the SP w/o telling the peer\n", tx->sta->sta.addr, tx->sta->sta.aid); } info->control.vif = &tx->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; __skb_queue_tail(&tid_tx->pending, skb); if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER) purge_skb = __skb_dequeue(&tid_tx->pending); } spin_unlock(&tx->sta->lock); if (purge_skb) ieee80211_free_txskb(&tx->local->hw, purge_skb); } /* reset session timer */ if (reset_agg_timer) tid_tx->last_tx = jiffies; return queued; } static void ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct rate_control_ref *ref = sdata->local->rate_ctrl; u16 tid; if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) return; if (!sta || !sta->sta.ht_cap.ht_supported || !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || skb->protocol == sdata->control_port_protocol) return; tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; if (likely(sta->ampdu_mlme.tid_tx[tid])) return; ieee80211_start_tx_ba_session(&sta->sta, tid, 0); } /* * initialises @tx * pass %NULL for the station if unknown, a valid pointer if known * or an ERR_PTR() if the station is known not to exist */ static ieee80211_tx_result ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_data *tx, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); bool aggr_check = false; int tid; memset(tx, 0, sizeof(*tx)); tx->skb = skb; tx->local = local; tx->sdata = sdata; __skb_queue_head_init(&tx->skbs); /* * If this flag is set to true anywhere, and we get here, * we are doing the needed processing, so remove the flag * now. */ info->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING; hdr = (struct ieee80211_hdr *) skb->data; if (likely(sta)) { if (!IS_ERR(sta)) tx->sta = sta; } else { if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { tx->sta = rcu_dereference(sdata->u.vlan.sta); if (!tx->sta && sdata->wdev.use_4addr) return TX_DROP; } else if (tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) { tx->sta = sta_info_get(sdata, hdr->addr1); aggr_check = true; } } if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && !ieee80211_is_qos_nullfunc(hdr->frame_control) && ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) && !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) { struct tid_ampdu_tx *tid_tx; tid = ieee80211_get_tid(hdr); tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx && aggr_check) { ieee80211_aggr_check(sdata, tx->sta, skb); tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); } if (tid_tx) { bool queued; queued = ieee80211_tx_prep_agg(tx, skb, info, tid_tx, tid); if (unlikely(queued)) return TX_QUEUED; } } if (is_multicast_ether_addr(hdr->addr1)) { tx->flags &= ~IEEE80211_TX_UNICAST; info->flags |= IEEE80211_TX_CTL_NO_ACK; } else tx->flags |= IEEE80211_TX_UNICAST; if (!(info->flags & IEEE80211_TX_CTL_DONTFRAG)) { if (!(tx->flags & IEEE80211_TX_UNICAST) || skb->len + FCS_LEN <= local->hw.wiphy->frag_threshold || info->flags & IEEE80211_TX_CTL_AMPDU) info->flags |= IEEE80211_TX_CTL_DONTFRAG; } if (!tx->sta) info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) { info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; ieee80211_check_fast_xmit(tx->sta); } info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT; return TX_CONTINUE; } static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, struct ieee80211_vif *vif, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_txq *txq = NULL; if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) return NULL; if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) && unlikely(!ieee80211_is_data_present(hdr->frame_control))) { if ((!ieee80211_is_mgmt(hdr->frame_control) || ieee80211_is_bufferable_mmpdu(hdr->frame_control) || vif->type == NL80211_IFTYPE_STATION) && sta && sta->uploaded) { /* * This will be NULL if the driver didn't set the * opt-in hardware flag. */ txq = sta->sta.txq[IEEE80211_NUM_TIDS]; } } else if (sta) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; if (!sta->uploaded) return NULL; txq = sta->sta.txq[tid]; } else if (vif) { txq = vif->txq; } if (!txq) return NULL; return to_txq_info(txq); } static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); } static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; } static codel_time_t codel_skb_time_func(const struct sk_buff *skb) { const struct ieee80211_tx_info *info; info = (const struct ieee80211_tx_info *)skb->cb; return info->control.enqueue_time; } static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars, void *ctx) { struct ieee80211_local *local; struct txq_info *txqi; struct fq *fq; struct fq_flow *flow; txqi = ctx; local = vif_to_sdata(txqi->txq.vif)->local; fq = &local->fq; if (cvars == &txqi->def_cvars) flow = &txqi->tin.default_flow; else flow = &fq->flows[cvars - local->cvars]; return fq_flow_dequeue(fq, flow); } static void codel_drop_func(struct sk_buff *skb, void *ctx) { struct ieee80211_local *local; struct ieee80211_hw *hw; struct txq_info *txqi; txqi = ctx; local = vif_to_sdata(txqi->txq.vif)->local; hw = &local->hw; ieee80211_free_txskb(hw, skb); } static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow) { struct ieee80211_local *local; struct txq_info *txqi; struct codel_vars *cvars; struct codel_params *cparams; struct codel_stats *cstats; local = container_of(fq, struct ieee80211_local, fq); txqi = container_of(tin, struct txq_info, tin); cstats = &txqi->cstats; if (txqi->txq.sta) { struct sta_info *sta = container_of(txqi->txq.sta, struct sta_info, sta); cparams = &sta->cparams; } else { cparams = &local->cparams; } if (flow == &tin->default_flow) cvars = &txqi->def_cvars; else cvars = &local->cvars[flow - fq->flows]; return codel_dequeue(txqi, &flow->backlog, cparams, cvars, cstats, codel_skb_len_func, codel_skb_time_func, codel_drop_func, codel_dequeue_func); } static void fq_skb_free_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow, struct sk_buff *skb) { struct ieee80211_local *local; local = container_of(fq, struct ieee80211_local, fq); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_txq_enqueue(struct ieee80211_local *local, struct txq_info *txqi, struct sk_buff *skb) { struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; u32 flow_idx = fq_flow_idx(fq, skb); ieee80211_set_skb_enqueue_time(skb); spin_lock_bh(&fq->lock); /* * For management frames, don't really apply codel etc., * we don't want to apply any shaping or anything we just * want to simplify the driver API by having them on the * txqi. */ if (unlikely(txqi->txq.tid == IEEE80211_NUM_TIDS)) { IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; __skb_queue_tail(&txqi->frags, skb); } else { fq_tin_enqueue(fq, tin, flow_idx, skb, fq_skb_free_func); } spin_unlock_bh(&fq->lock); } static bool fq_vlan_filter_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow, struct sk_buff *skb, void *data) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); return info->control.vif == data; } void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct fq *fq = &local->fq; struct txq_info *txqi; struct fq_tin *tin; struct ieee80211_sub_if_data *ap; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) return; ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (!ap->vif.txq) return; txqi = to_txq_info(ap->vif.txq); tin = &txqi->tin; spin_lock_bh(&fq->lock); fq_tin_filter(fq, tin, fq_vlan_filter_func, &sdata->vif, fq_skb_free_func); spin_unlock_bh(&fq->lock); } void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct txq_info *txqi, int tid) { fq_tin_init(&txqi->tin); codel_vars_init(&txqi->def_cvars); codel_stats_init(&txqi->cstats); __skb_queue_head_init(&txqi->frags); RB_CLEAR_NODE(&txqi->schedule_order); txqi->txq.vif = &sdata->vif; if (!sta) { sdata->vif.txq = &txqi->txq; txqi->txq.tid = 0; txqi->txq.ac = IEEE80211_AC_BE; return; } if (tid == IEEE80211_NUM_TIDS) { if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* Drivers need to opt in to the management MPDU TXQ */ if (!ieee80211_hw_check(&sdata->local->hw, STA_MMPDU_TXQ)) return; } else if (!ieee80211_hw_check(&sdata->local->hw, BUFF_MMPDU_TXQ)) { /* Drivers need to opt in to the bufferable MMPDU TXQ */ return; } txqi->txq.ac = IEEE80211_AC_VO; } else { txqi->txq.ac = ieee80211_ac_from_tid(tid); } txqi->txq.sta = &sta->sta; txqi->txq.tid = tid; sta->sta.txq[tid] = &txqi->txq; } void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi) { struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; spin_lock_bh(&fq->lock); fq_tin_reset(fq, tin, fq_skb_free_func); ieee80211_purge_tx_queue(&local->hw, &txqi->frags); spin_unlock_bh(&fq->lock); ieee80211_unschedule_txq(&local->hw, &txqi->txq, true); } void ieee80211_txq_set_params(struct ieee80211_local *local) { if (local->hw.wiphy->txq_limit) local->fq.limit = local->hw.wiphy->txq_limit; else local->hw.wiphy->txq_limit = local->fq.limit; if (local->hw.wiphy->txq_memory_limit) local->fq.memory_limit = local->hw.wiphy->txq_memory_limit; else local->hw.wiphy->txq_memory_limit = local->fq.memory_limit; if (local->hw.wiphy->txq_quantum) local->fq.quantum = local->hw.wiphy->txq_quantum; else local->hw.wiphy->txq_quantum = local->fq.quantum; } int ieee80211_txq_setup_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; int ret; int i; bool supp_vht = false; enum nl80211_band band; if (!local->ops->wake_tx_queue) return 0; ret = fq_init(fq, 4096); if (ret) return ret; /* * If the hardware doesn't support VHT, it is safe to limit the maximum * queue size. 4 Mbytes is 64 max-size aggregates in 802.11n. */ for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[band]; if (!sband) continue; supp_vht = supp_vht || sband->vht_cap.vht_supported; } if (!supp_vht) fq->memory_limit = 4 << 20; /* 4 Mbytes */ codel_params_init(&local->cparams); local->cparams.interval = MS2TIME(100); local->cparams.target = MS2TIME(20); local->cparams.ecn = true; local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]), GFP_KERNEL); if (!local->cvars) { spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); spin_unlock_bh(&fq->lock); return -ENOMEM; } for (i = 0; i < fq->flows_cnt; i++) codel_vars_init(&local->cvars[i]); ieee80211_txq_set_params(local); return 0; } void ieee80211_txq_teardown_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; if (!local->ops->wake_tx_queue) return; kfree(local->cvars); local->cvars = NULL; spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); spin_unlock_bh(&fq->lock); } static bool ieee80211_queue_skb(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_vif *vif; struct txq_info *txqi; if (!local->ops->wake_tx_queue || sdata->vif.type == NL80211_IFTYPE_MONITOR) return false; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); vif = &sdata->vif; txqi = ieee80211_get_txq(local, vif, sta, skb); if (!txqi) return false; ieee80211_txq_enqueue(local, txqi, skb); schedule_and_wake_txq(local, txqi); return true; } static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, struct sta_info *sta, struct sk_buff_head *skbs, bool txpending) { struct ieee80211_tx_control control = {}; struct sk_buff *skb, *tmp; unsigned long flags; skb_queue_walk_safe(skbs, skb, tmp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int q = info->hw_queue; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (WARN_ON_ONCE(q >= local->hw.queues)) { __skb_unlink(skb, skbs); ieee80211_free_txskb(&local->hw, skb); continue; } #endif spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) { if (local->queue_stop_reasons[q] & ~BIT(IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL)) { /* * Drop off-channel frames if queues * are stopped for any reason other * than off-channel operation. Never * queue them. */ spin_unlock_irqrestore( &local->queue_stop_reason_lock, flags); ieee80211_purge_tx_queue(&local->hw, skbs); return true; } } else { /* * Since queue is stopped, queue up frames for * later transmission from the tx-pending * tasklet when the queue is woken again. */ if (txpending) skb_queue_splice_init(skbs, &local->pending[q]); else skb_queue_splice_tail_init(skbs, &local->pending[q]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return false; } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; control.sta = sta ? &sta->sta : NULL; __skb_unlink(skb, skbs); drv_tx(local, &control, skb); } return true; } /* * Returns false if the frame couldn't be transmitted but was queued instead. */ static bool __ieee80211_tx(struct ieee80211_local *local, struct sk_buff_head *skbs, struct sta_info *sta, bool txpending) { struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; struct sk_buff *skb; bool result; if (WARN_ON(skb_queue_empty(skbs))) return true; skb = skb_peek(skbs); info = IEEE80211_SKB_CB(skb); sdata = vif_to_sdata(info->control.vif); if (sta && !sta->uploaded) sta = NULL; switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { vif = &sdata->vif; break; } sdata = rcu_dereference(local->monitor_sdata); if (sdata) { vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { ieee80211_purge_tx_queue(&local->hw, skbs); return true; } else vif = NULL; break; case NL80211_IFTYPE_AP_VLAN: sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); fallthrough; default: vif = &sdata->vif; break; } result = ieee80211_tx_frags(local, vif, sta, skbs, txpending); WARN_ON_ONCE(!skb_queue_empty(skbs)); return result; } /* * Invoke TX handlers, return 0 on success and non-zero if the * frame was dropped or queued. * * The handlers are split into an early and late part. The latter is everything * that can be sensitive to reordering, and will be deferred to after packets * are dequeued from the intermediate queues (when they are enabled). */ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) { ieee80211_tx_result res = TX_DROP; #define CALL_TXH(txh) \ do { \ res = txh(tx); \ if (res != TX_CONTINUE) \ goto txh_done; \ } while (0) CALL_TXH(ieee80211_tx_h_dynamic_ps); CALL_TXH(ieee80211_tx_h_check_assoc); CALL_TXH(ieee80211_tx_h_ps_buf); CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); txh_done: if (unlikely(res == TX_DROP)) { I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); return -1; } else if (unlikely(res == TX_QUEUED)) { I802_DEBUG_INC(tx->local->tx_handlers_queued); return -1; } return 0; } /* * Late handlers can be called while the sta lock is held. Handlers that can * cause packets to be generated will cause deadlock! */ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); ieee80211_tx_result res = TX_CONTINUE; if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { __skb_queue_tail(&tx->skbs, tx->skb); tx->skb = NULL; goto txh_done; } CALL_TXH(ieee80211_tx_h_michael_mic_add); CALL_TXH(ieee80211_tx_h_sequence); CALL_TXH(ieee80211_tx_h_fragment); /* handlers after fragment must be aware of tx info fragmentation! */ CALL_TXH(ieee80211_tx_h_stats); CALL_TXH(ieee80211_tx_h_encrypt); if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_calculate_duration); #undef CALL_TXH txh_done: if (unlikely(res == TX_DROP)) { I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); return -1; } else if (unlikely(res == TX_QUEUED)) { I802_DEBUG_INC(tx->local->tx_handlers_queued); return -1; } return 0; } static int invoke_tx_handlers(struct ieee80211_tx_data *tx) { int r = invoke_tx_handlers_early(tx); if (r) return r; return invoke_tx_handlers_late(tx); } bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb, int band, struct ieee80211_sta **sta) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_tx_data tx; struct sk_buff *skb2; if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP) return false; info->band = band; info->control.vif = vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers(&tx)) return false; if (sta) { if (tx.sta) *sta = &tx.sta->sta; else *sta = NULL; } /* this function isn't suitable for fragmented data frames */ skb2 = __skb_dequeue(&tx.skbs); if (WARN_ON(skb2 != skb || !skb_queue_empty(&tx.skbs))) { ieee80211_free_txskb(hw, skb2); ieee80211_purge_tx_queue(hw, &tx.skbs); return false; } return true; } EXPORT_SYMBOL(ieee80211_tx_prepare_skb); /* * Returns false if the frame couldn't be transmitted but was queued instead. */ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb, bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; ieee80211_tx_result res_prepare; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); bool result = true; if (unlikely(skb->len < 10)) { dev_kfree_skb(skb); return true; } /* initialises tx */ res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); return true; } else if (unlikely(res_prepare == TX_QUEUED)) { return true; } /* set up hw_queue value early */ if (!(info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers_early(&tx)) return true; if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) return true; if (!invoke_tx_handlers_late(&tx)) result = __ieee80211_tx(local, &tx.skbs, tx.sta, txpending); return result; } /* device xmit handlers */ enum ieee80211_encrypt { ENCRYPT_NO, ENCRYPT_MGMT, ENCRYPT_DATA, }; static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int head_need, enum ieee80211_encrypt encrypt) { struct ieee80211_local *local = sdata->local; bool enc_tailroom; int tail_need = 0; enc_tailroom = encrypt == ENCRYPT_MGMT || (encrypt == ENCRYPT_DATA && sdata->crypto_tx_tailroom_needed_cnt); if (enc_tailroom) { tail_need = IEEE80211_ENCRYPT_TAILROOM; tail_need -= skb_tailroom(skb); tail_need = max_t(int, tail_need, 0); } if (skb_cloned(skb) && (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) || !skb_clone_writable(skb, ETH_HLEN) || enc_tailroom)) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); else return 0; if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) { wiphy_debug(local->hw.wiphy, "failed to reallocate TX buffer\n"); return -ENOMEM; } return 0; } void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; int headroom; enum ieee80211_encrypt encrypt; if (info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT) encrypt = ENCRYPT_NO; else if (ieee80211_is_mgmt(hdr->frame_control)) encrypt = ENCRYPT_MGMT; else encrypt = ENCRYPT_DATA; headroom = local->tx_headroom; if (encrypt != ENCRYPT_NO) headroom += sdata->encrypt_headroom; headroom -= skb_headroom(skb); headroom = max_t(int, 0, headroom); if (ieee80211_skb_resize(sdata, skb, headroom, encrypt)) { ieee80211_free_txskb(&local->hw, skb); return; } /* reload after potential resize */ hdr = (struct ieee80211_hdr *) skb->data; info->control.vif = &sdata->vif; if (ieee80211_vif_is_mesh(&sdata->vif)) { if (ieee80211_is_data(hdr->frame_control) && is_unicast_ether_addr(hdr->addr1)) { if (mesh_nexthop_resolve(sdata, skb)) return; /* skb queued: don't free */ } else { ieee80211_mps_set_frame_flags(sdata, NULL, hdr); } } ieee80211_set_qos_hdr(sdata, skb); ieee80211_tx(sdata, sta, skb, false); } static bool ieee80211_validate_radiotap_len(struct sk_buff *skb) { struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *)skb->data; /* check for not even having the fixed radiotap header part */ if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) return false; /* too short to be possibly valid */ /* is it a header version we can trust to find length from? */ if (unlikely(rthdr->it_version)) return false; /* only version 0 is supported */ /* does the skb contain enough to deliver on the alleged length? */ if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data))) return false; /* skb too short for claimed rt header extent */ return true; } bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_radiotap_iterator iterator; struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len, NULL); u16 txflags; u16 rate = 0; bool rate_found = false; u8 rate_retries = 0; u16 rate_flags = 0; u8 mcs_known, mcs_flags, mcs_bw; u16 vht_known; u8 vht_mcs = 0, vht_nss = 0; int i; if (!ieee80211_validate_radiotap_len(skb)) return false; info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_CTL_DONTFRAG; /* * for every radiotap entry that is present * (ieee80211_radiotap_iterator_next returns -ENOENT when no more * entries present, or -EINVAL on error) */ while (!ret) { ret = ieee80211_radiotap_iterator_next(&iterator); if (ret) continue; /* see if this argument is something we can use */ switch (iterator.this_arg_index) { /* * You must take care when dereferencing iterator.this_arg * for multibyte types... the pointer is not aligned. Use * get_unaligned((type *)iterator.this_arg) to dereference * iterator.this_arg for type "type" safely on all arches. */ case IEEE80211_RADIOTAP_FLAGS: if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FCS) { /* * this indicates that the skb we have been * handed has the 32-bit FCS CRC at the end... * we should react to that by snipping it off * because it will be recomputed and added * on transmission */ if (skb->len < (iterator._max_length + FCS_LEN)) return false; skb_trim(skb, skb->len - FCS_LEN); } if (*iterator.this_arg & IEEE80211_RADIOTAP_F_WEP) info->flags &= ~IEEE80211_TX_INTFL_DONT_ENCRYPT; if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FRAG) info->flags &= ~IEEE80211_TX_CTL_DONTFRAG; break; case IEEE80211_RADIOTAP_TX_FLAGS: txflags = get_unaligned_le16(iterator.this_arg); if (txflags & IEEE80211_RADIOTAP_F_TX_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; if (txflags & IEEE80211_RADIOTAP_F_TX_NOSEQNO) info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; if (txflags & IEEE80211_RADIOTAP_F_TX_ORDER) info->control.flags |= IEEE80211_TX_CTRL_DONT_REORDER; break; case IEEE80211_RADIOTAP_RATE: rate = *iterator.this_arg; rate_flags = 0; rate_found = true; break; case IEEE80211_RADIOTAP_DATA_RETRIES: rate_retries = *iterator.this_arg; break; case IEEE80211_RADIOTAP_MCS: mcs_known = iterator.this_arg[0]; mcs_flags = iterator.this_arg[1]; if (!(mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_MCS)) break; rate_found = true; rate = iterator.this_arg[2]; rate_flags = IEEE80211_TX_RC_MCS; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_GI && mcs_flags & IEEE80211_RADIOTAP_MCS_SGI) rate_flags |= IEEE80211_TX_RC_SHORT_GI; mcs_bw = mcs_flags & IEEE80211_RADIOTAP_MCS_BW_MASK; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW && mcs_bw == IEEE80211_RADIOTAP_MCS_BW_40) rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_FEC && mcs_flags & IEEE80211_RADIOTAP_MCS_FEC_LDPC) info->flags |= IEEE80211_TX_CTL_LDPC; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_STBC) { u8 stbc = u8_get_bits(mcs_flags, IEEE80211_RADIOTAP_MCS_STBC_MASK); info->flags |= u32_encode_bits(stbc, IEEE80211_TX_CTL_STBC); } break; case IEEE80211_RADIOTAP_VHT: vht_known = get_unaligned_le16(iterator.this_arg); rate_found = true; rate_flags = IEEE80211_TX_RC_VHT_MCS; if ((vht_known & IEEE80211_RADIOTAP_VHT_KNOWN_GI) && (iterator.this_arg[2] & IEEE80211_RADIOTAP_VHT_FLAG_SGI)) rate_flags |= IEEE80211_TX_RC_SHORT_GI; if (vht_known & IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH) { if (iterator.this_arg[3] == 1) rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; else if (iterator.this_arg[3] == 4) rate_flags |= IEEE80211_TX_RC_80_MHZ_WIDTH; else if (iterator.this_arg[3] == 11) rate_flags |= IEEE80211_TX_RC_160_MHZ_WIDTH; } vht_mcs = iterator.this_arg[4] >> 4; if (vht_mcs > 11) vht_mcs = 0; vht_nss = iterator.this_arg[4] & 0xF; if (!vht_nss || vht_nss > 8) vht_nss = 1; break; /* * Please update the file * Documentation/networking/mac80211-injection.rst * when parsing new fields here. */ default: break; } } if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */ return false; if (rate_found) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[info->band]; info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } if (rate_flags & IEEE80211_TX_RC_MCS) { info->control.rates[0].idx = rate; } else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) { ieee80211_rate_set_vht(info->control.rates, vht_mcs, vht_nss); } else if (sband) { for (i = 0; i < sband->n_bitrates; i++) { if (rate * 5 != sband->bitrates[i].bitrate) continue; info->control.rates[0].idx = i; break; } } if (info->control.rates[0].idx < 0) info->control.flags &= ~IEEE80211_TX_CTRL_RATE_INJECT; info->control.rates[0].flags = rate_flags; info->control.rates[0].count = min_t(u8, rate_retries + 1, local->hw.max_rate_tries); } return true; } netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *tmp_sdata, *sdata; struct cfg80211_chan_def *chandef; u16 len_rthdr; int hdrlen; memset(info, 0, sizeof(*info)); info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_CTL_INJECTED; /* Sanity-check the length of the radiotap header */ if (!ieee80211_validate_radiotap_len(skb)) goto fail; /* we now know there is a radiotap header with a length we can use */ len_rthdr = ieee80211_get_radiotap_len(skb->data); /* * fix up the pointers accounting for the radiotap * header still being in there. We are being given * a precooked IEEE80211 header so no need for * normal processing */ skb_set_mac_header(skb, len_rthdr); /* * these are just fixed to the end of the rt area since we * don't have any better information and at this point, nobody cares */ skb_set_network_header(skb, len_rthdr); skb_set_transport_header(skb, len_rthdr); if (skb->len < len_rthdr + 2) goto fail; hdr = (struct ieee80211_hdr *)(skb->data + len_rthdr); hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < len_rthdr + hdrlen) goto fail; /* * Initialize skb->protocol if the injected frame is a data frame * carrying a rfc1042 header */ if (ieee80211_is_data(hdr->frame_control) && skb->len >= len_rthdr + hdrlen + sizeof(rfc1042_header) + 2) { u8 *payload = (u8 *)hdr + hdrlen; if (ether_addr_equal(payload, rfc1042_header)) skb->protocol = cpu_to_be16((payload[6] << 8) | payload[7]); } rcu_read_lock(); /* * We process outgoing injected frames that have a local address * we handle as though they are non-injected frames. * This code here isn't entirely correct, the local MAC address * isn't always enough to find the interface to use; for proper * VLAN support we have an nl80211-based mechanism. * * This is necessary, for example, for old hostapd versions that * don't use nl80211-based management TX/RX. */ sdata = IEEE80211_DEV_TO_SUB_IF(dev); list_for_each_entry_rcu(tmp_sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(tmp_sdata)) continue; if (tmp_sdata->vif.type == NL80211_IFTYPE_MONITOR || tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN) continue; if (ether_addr_equal(tmp_sdata->vif.addr, hdr->addr2)) { sdata = tmp_sdata; break; } } chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { tmp_sdata = rcu_dereference(local->monitor_sdata); if (tmp_sdata) chanctx_conf = rcu_dereference(tmp_sdata->vif.chanctx_conf); } if (chanctx_conf) chandef = &chanctx_conf->def; else if (!local->use_chanctx) chandef = &local->_oper_chandef; else goto fail_rcu; /* * Frame injection is not allowed if beaconing is not allowed * or if we need radar detection. Beaconing is usually not allowed when * the mode or operation (Adhoc, AP, Mesh) does not support DFS. * Passive scan is also used in world regulatory domains where * your country is not known and as such it should be treated as * NO TX unless the channel is explicitly allowed in which case * your current regulatory domain would not have the passive scan * flag. * * Since AP mode uses monitor interfaces to inject/TX management * frames we can make AP mode the exception to this rule once it * supports radar detection as its implementation can deal with * radar detection by itself. We can do that later by adding a * monitor flag interfaces used for AP support. */ if (!cfg80211_reg_can_beacon(local->hw.wiphy, chandef, sdata->vif.type)) goto fail_rcu; info->band = chandef->chan->band; /* Initialize skb->priority according to frame type and TID class, * with respect to the sub interface that the frame will actually * be transmitted on. If the DONT_REORDER flag is set, the original * skb-priority is preserved to assure frames injected with this * flag are not reordered relative to each other. */ ieee80211_select_queue_80211(sdata, skb, hdr); skb_set_queue_mapping(skb, ieee80211_ac_from_tid(skb->priority)); /* * Process the radiotap header. This will now take into account the * selected chandef above to accurately set injection rates and * retransmissions. */ if (!ieee80211_parse_tx_radiotap(skb, dev)) goto fail_rcu; /* remove the injection radiotap header */ skb_pull(skb, len_rthdr); ieee80211_xmit(sdata, NULL, skb); rcu_read_unlock(); return NETDEV_TX_OK; fail_rcu: rcu_read_unlock(); fail: dev_kfree_skb(skb); return NETDEV_TX_OK; /* meaning, we dealt with the skb */ } static inline bool ieee80211_is_tdls_setup(struct sk_buff *skb) { u16 ethertype = (skb->data[12] << 8) | skb->data[13]; return ethertype == ETH_P_TDLS && skb->len > 14 && skb->data[14] == WLAN_TDLS_SNAP_RFTYPE; } int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info **sta_out) { struct sta_info *sta; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: sta = rcu_dereference(sdata->u.vlan.sta); if (sta) { *sta_out = sta; return 0; } else if (sdata->wdev.use_4addr) { return -ENOLINK; } fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_ADHOC: if (is_multicast_ether_addr(skb->data)) { *sta_out = ERR_PTR(-ENOENT); return 0; } sta = sta_info_get_bss(sdata, skb->data); break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: /* determined much later */ *sta_out = NULL; return 0; #endif case NL80211_IFTYPE_STATION: if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { sta = sta_info_get(sdata, skb->data); if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) { *sta_out = sta; return 0; } /* * TDLS link during setup - throw out frames to * peer. Allow TDLS-setup frames to unauthorized * peers for the special case of a link teardown * after a TDLS sta is removed due to being * unreachable. */ if (!ieee80211_is_tdls_setup(skb)) return -EINVAL; } } sta = sta_info_get(sdata, sdata->u.mgd.bssid); if (!sta) return -ENOLINK; break; default: return -EINVAL; } *sta_out = sta ?: ERR_PTR(-ENOENT); return 0; } static u16 ieee80211_store_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u32 *info_flags, u64 *cookie) { struct sk_buff *ack_skb; u16 info_id = 0; if (skb->sk) ack_skb = skb_clone_sk(skb); else ack_skb = skb_clone(skb, GFP_ATOMIC); if (ack_skb) { unsigned long flags; int id; spin_lock_irqsave(&local->ack_status_lock, flags); id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (id >= 0) { info_id = id; *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (cookie) { *cookie = ieee80211_mgmt_tx_cookie(local); IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; } } else { kfree_skb(ack_skb); } } return info_id; } /** * ieee80211_build_hdr - build 802.11 header in the given frame * @sdata: virtual interface to build the header for * @skb: the skb to build the header in * @info_flags: skb flags to set * @sta: the station pointer * @ctrl_flags: info control flags to set * @cookie: cookie pointer to fill (if not %NULL) * * This function takes the skb with 802.3 header and reformats the header to * the appropriate IEEE 802.11 header based on which interface the packet is * being transmitted on. * * Note that this function also takes care of the TX status request and * potential unsharing of the SKB - this needs to be interleaved with the * header building. * * The function requires the read-side RCU lock held * * Returns: the (possibly reallocated) skb or an ERR_PTR() code */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags, struct sta_info *sta, u32 ctrl_flags, u64 *cookie) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; int head_need; u16 ethertype, hdrlen, meshhdrlen = 0; __le16 fc; struct ieee80211_hdr hdr; struct ieee80211s_hdr mesh_hdr __maybe_unused; struct mesh_path __maybe_unused *mppath = NULL, *mpath = NULL; const u8 *encaps_data; int encaps_len, skip_header_bytes; bool wme_sta = false, authorized = false; bool tdls_peer; bool multicast; u16 info_id = 0; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_sub_if_data *ap_sdata; enum nl80211_band band; int ret; if (IS_ERR(sta)) sta = NULL; #ifdef CONFIG_MAC80211_DEBUGFS if (local->force_tx_status) info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; #endif /* convert Ethernet header to proper 802.11 header (based on * operation mode) */ ethertype = (skb->data[12] << 8) | skb->data[13]; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: if (sdata->wdev.use_4addr) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr1, sta->sta.addr, ETH_ALEN); memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); wme_sta = sta->sta.wme; } ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); chanctx_conf = rcu_dereference(ap_sdata->vif.chanctx_conf); if (!chanctx_conf) { ret = -ENOTCONN; goto free; } band = chanctx_conf->def.chan->band; if (sdata->wdev.use_4addr) break; fallthrough; case NL80211_IFTYPE_AP: if (sdata->vif.type == NL80211_IFTYPE_AP) chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { ret = -ENOTCONN; goto free; } fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 24; band = chanctx_conf->def.chan->band; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: if (!is_multicast_ether_addr(skb->data)) { struct sta_info *next_hop; bool mpp_lookup = true; mpath = mesh_path_lookup(sdata, skb->data); if (mpath) { mpp_lookup = false; next_hop = rcu_dereference(mpath->next_hop); if (!next_hop || !(mpath->flags & (MESH_PATH_ACTIVE | MESH_PATH_RESOLVING))) mpp_lookup = true; } if (mpp_lookup) { mppath = mpp_path_lookup(sdata, skb->data); if (mppath) mppath->exp_time = jiffies; } if (mppath && mpath) mesh_path_del(sdata, mpath->dst); } /* * Use address extension if it is a packet from * another interface or if we know the destination * is being proxied by a portal (i.e. portal address * differs from proxied address) */ if (ether_addr_equal(sdata->vif.addr, skb->data + ETH_ALEN) && !(mppath && !ether_addr_equal(mppath->mpp, skb->data))) { hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, skb->data, skb->data + ETH_ALEN); meshhdrlen = ieee80211_new_mesh_header(sdata, &mesh_hdr, NULL, NULL); } else { /* DS -> MBSS (802.11-2012 13.11.3.3). * For unicast with unknown forwarding information, * destination might be in the MBSS or if that fails * forwarded to another mesh gate. In either case * resolution will be handled in ieee80211_xmit(), so * leave the original DA. This also works for mcast */ const u8 *mesh_da = skb->data; if (mppath) mesh_da = mppath->mpp; else if (mpath) mesh_da = mpath->dst; hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, mesh_da, sdata->vif.addr); if (is_multicast_ether_addr(mesh_da)) /* DA TA mSA AE:SA */ meshhdrlen = ieee80211_new_mesh_header( sdata, &mesh_hdr, skb->data + ETH_ALEN, NULL); else /* RA TA mDA mSA AE:DA SA */ meshhdrlen = ieee80211_new_mesh_header( sdata, &mesh_hdr, skb->data, skb->data + ETH_ALEN); } chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { ret = -ENOTCONN; goto free; } band = chanctx_conf->def.chan->band; /* For injected frames, fill RA right away as nexthop lookup * will be skipped. */ if ((ctrl_flags & IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP) && is_zero_ether_addr(hdr.addr1)) memcpy(hdr.addr1, skb->data, ETH_ALEN); break; #endif case NL80211_IFTYPE_STATION: /* we already did checks when looking up the RA STA */ tdls_peer = test_sta_flag(sta, WLAN_STA_TDLS_PEER); if (tdls_peer) { /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, sdata->u.mgd.bssid, ETH_ALEN); hdrlen = 24; } else if (sdata->u.mgd.use_4addr && cpu_to_be16(ethertype) != sdata->control_port_protocol) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN); memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; } else { fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); hdrlen = 24; } chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { ret = -ENOTCONN; goto free; } band = chanctx_conf->def.chan->band; break; case NL80211_IFTYPE_OCB: /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); eth_broadcast_addr(hdr.addr3); hdrlen = 24; chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { ret = -ENOTCONN; goto free; } band = chanctx_conf->def.chan->band; break; case NL80211_IFTYPE_ADHOC: /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN); hdrlen = 24; chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { ret = -ENOTCONN; goto free; } band = chanctx_conf->def.chan->band; break; default: ret = -EINVAL; goto free; } multicast = is_multicast_ether_addr(hdr.addr1); /* sta is always NULL for mesh */ if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); wme_sta = sta->sta.wme; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { /* For mesh, the use of the QoS header is mandatory */ wme_sta = true; } /* receiver does QoS (which also means we do) use it */ if (wme_sta) { fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); hdrlen += 2; } /* * Drop unicast frames to unauthorised stations unless they are * EAPOL frames from the local station. */ if (unlikely(!ieee80211_vif_is_mesh(&sdata->vif) && (sdata->vif.type != NL80211_IFTYPE_OCB) && !multicast && !authorized && (cpu_to_be16(ethertype) != sdata->control_port_protocol || !ether_addr_equal(sdata->vif.addr, skb->data + ETH_ALEN)))) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG net_info_ratelimited("%s: dropped frame to %pM (unauthorized port)\n", sdata->name, hdr.addr1); #endif I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); ret = -EPERM; goto free; } if (unlikely(!multicast && ((skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) info_id = ieee80211_store_ack_skb(local, skb, &info_flags, cookie); /* * If the skb is shared we need to obtain our own copy. */ if (skb_shared(skb)) { struct sk_buff *tmp_skb = skb; /* can't happen -- skb is a clone if info_id != 0 */ WARN_ON(info_id); skb = skb_clone(skb, GFP_ATOMIC); kfree_skb(tmp_skb); if (!skb) { ret = -ENOMEM; goto free; } } hdr.frame_control = fc; hdr.duration_id = 0; hdr.seq_ctrl = 0; skip_header_bytes = ETH_HLEN; if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { encaps_data = bridge_tunnel_header; encaps_len = sizeof(bridge_tunnel_header); skip_header_bytes -= 2; } else if (ethertype >= ETH_P_802_3_MIN) { encaps_data = rfc1042_header; encaps_len = sizeof(rfc1042_header); skip_header_bytes -= 2; } else { encaps_data = NULL; encaps_len = 0; } skb_pull(skb, skip_header_bytes); head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb); /* * So we need to modify the skb header and hence need a copy of * that. The head_need variable above doesn't, so far, include * the needed header space that we don't need right away. If we * can, then we don't reallocate right now but only after the * frame arrives at the master device (if it does...) * * If we cannot, however, then we will reallocate to include all * the ever needed space. Also, if we need to reallocate it anyway, * make it big enough for everything we may ever need. */ if (head_need > 0 || skb_cloned(skb)) { head_need += sdata->encrypt_headroom; head_need += local->tx_headroom; head_need = max_t(int, 0, head_need); if (ieee80211_skb_resize(sdata, skb, head_need, ENCRYPT_DATA)) { ieee80211_free_txskb(&local->hw, skb); skb = NULL; return ERR_PTR(-ENOMEM); } } if (encaps_data) memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); #ifdef CONFIG_MAC80211_MESH if (meshhdrlen > 0) memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen); #endif if (ieee80211_is_data_qos(fc)) { __le16 *qos_control; qos_control = skb_push(skb, 2); memcpy(skb_push(skb, hdrlen - 2), &hdr, hdrlen - 2); /* * Maybe we could actually set some fields here, for now just * initialise to zero to indicate no special operation. */ *qos_control = 0; } else memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); skb_reset_mac_header(skb); info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->flags = info_flags; info->ack_frame_id = info_id; info->band = band; info->control.flags = ctrl_flags; return skb; free: kfree_skb(skb); return ERR_PTR(ret); } /* * fast-xmit overview * * The core idea of this fast-xmit is to remove per-packet checks by checking * them out of band. ieee80211_check_fast_xmit() implements the out-of-band * checks that are needed to get the sta->fast_tx pointer assigned, after which * much less work can be done per packet. For example, fragmentation must be * disabled or the fast_tx pointer will not be set. All the conditions are seen * in the code here. * * Once assigned, the fast_tx data structure also caches the per-packet 802.11 * header and other data to aid packet processing in ieee80211_xmit_fast(). * * The most difficult part of this is that when any of these assumptions * change, an external trigger (i.e. a call to ieee80211_clear_fast_xmit(), * ieee80211_check_fast_xmit() or friends) is required to reset the data, * since the per-packet code no longer checks the conditions. This is reflected * by the calls to these functions throughout the rest of the code, and must be * maintained if any of the TX path checks change. */ void ieee80211_check_fast_xmit(struct sta_info *sta) { struct ieee80211_fast_tx build = {}, *fast_tx = NULL, *old; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_hdr *hdr = (void *)build.hdr; struct ieee80211_chanctx_conf *chanctx_conf; __le16 fc; if (!ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT)) return; /* Locking here protects both the pointer itself, and against concurrent * invocations winning data access races to, e.g., the key pointer that * is used. * Without it, the invocation of this function right after the key * pointer changes wouldn't be sufficient, as another CPU could access * the pointer, then stall, and then do the cache update after the CPU * that invalidated the key. * With the locking, such scenarios cannot happen as the check for the * key and the fast-tx assignment are done atomically, so the CPU that * modifies the key will either wait or other one will see the key * cleared/changed already. */ spin_lock_bh(&sta->lock); if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) && sdata->vif.type == NL80211_IFTYPE_STATION) goto out; if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED) || !sta->uploaded) goto out; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER) || test_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT)) goto out; if (sdata->noack_map) goto out; /* fast-xmit doesn't handle fragmentation at all */ if (local->hw.wiphy->frag_threshold != (u32)-1 && !ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG)) goto out; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); goto out; } build.band = chanctx_conf->def.chan->band; rcu_read_unlock(); fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: /* DA SA BSSID */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); memcpy(hdr->addr3, sdata->u.ibss.bssid, ETH_ALEN); build.hdr_len = 24; break; case NL80211_IFTYPE_STATION: if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { /* DA SA BSSID */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); memcpy(hdr->addr3, sdata->u.mgd.bssid, ETH_ALEN); build.hdr_len = 24; break; } if (sdata->u.mgd.use_4addr) { /* non-regular ethertype cannot use the fastpath */ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr4); build.hdr_len = 30; break; } fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); build.hdr_len = 24; break; case NL80211_IFTYPE_AP_VLAN: if (sdata->wdev.use_4addr) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); build.da_offs = offsetof(struct ieee80211_hdr, addr3); build.sa_offs = offsetof(struct ieee80211_hdr, addr4); build.hdr_len = 30; break; } fallthrough; case NL80211_IFTYPE_AP: fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); build.sa_offs = offsetof(struct ieee80211_hdr, addr3); build.hdr_len = 24; break; default: /* not handled on fast-xmit */ goto out; } if (sta->sta.wme) { build.hdr_len += 2; fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); } /* We store the key here so there's no point in using rcu_dereference() * but that's fine because the code that changes the pointers will call * this function after doing so. For a single CPU that would be enough, * for multiple see the comment above. */ build.key = rcu_access_pointer(sta->ptk[sta->ptk_idx]); if (!build.key) build.key = rcu_access_pointer(sdata->default_unicast_key); if (build.key) { bool gen_iv, iv_spc, mmic; gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV; iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE; mmic = build.key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE); /* don't handle software crypto */ if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) goto out; /* Key is being removed */ if (build.key->flags & KEY_FLAG_TAINTED) goto out; switch (build.key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: if (gen_iv) build.pn_offs = build.hdr_len; if (gen_iv || iv_spc) build.hdr_len += IEEE80211_CCMP_HDR_LEN; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (gen_iv) build.pn_offs = build.hdr_len; if (gen_iv || iv_spc) build.hdr_len += IEEE80211_GCMP_HDR_LEN; break; case WLAN_CIPHER_SUITE_TKIP: /* cannot handle MMIC or IV generation in xmit-fast */ if (mmic || gen_iv) goto out; if (iv_spc) build.hdr_len += IEEE80211_TKIP_IV_LEN; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* cannot handle IV generation in fast-xmit */ if (gen_iv) goto out; if (iv_spc) build.hdr_len += IEEE80211_WEP_IV_LEN; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: WARN(1, "management cipher suite 0x%x enabled for data\n", build.key->conf.cipher); goto out; default: /* we don't know how to generate IVs for this at all */ if (WARN_ON(gen_iv)) goto out; /* pure hardware keys are OK, of course */ if (!(build.key->flags & KEY_FLAG_CIPHER_SCHEME)) break; /* cipher scheme might require space allocation */ if (iv_spc && build.key->conf.iv_len > IEEE80211_FAST_XMIT_MAX_IV) goto out; if (iv_spc) build.hdr_len += build.key->conf.iv_len; } fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); } hdr->frame_control = fc; memcpy(build.hdr + build.hdr_len, rfc1042_header, sizeof(rfc1042_header)); build.hdr_len += sizeof(rfc1042_header); fast_tx = kmemdup(&build, sizeof(build), GFP_ATOMIC); /* if the kmemdup fails, continue w/o fast_tx */ if (!fast_tx) goto out; out: /* we might have raced against another call to this function */ old = rcu_dereference_protected(sta->fast_tx, lockdep_is_held(&sta->lock)); rcu_assign_pointer(sta->fast_tx, fast_tx); if (old) kfree_rcu(old, rcu_head); spin_unlock_bh(&sta->lock); } void ieee80211_check_fast_xmit_all(struct ieee80211_local *local) { struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) ieee80211_check_fast_xmit(sta); rcu_read_unlock(); } void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata && (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) continue; ieee80211_check_fast_xmit(sta); } rcu_read_unlock(); } void ieee80211_clear_fast_xmit(struct sta_info *sta) { struct ieee80211_fast_tx *fast_tx; spin_lock_bh(&sta->lock); fast_tx = rcu_dereference_protected(sta->fast_tx, lockdep_is_held(&sta->lock)); RCU_INIT_POINTER(sta->fast_tx, NULL); spin_unlock_bh(&sta->lock); if (fast_tx) kfree_rcu(fast_tx, rcu_head); } static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local, struct sk_buff *skb, int headroom) { if (skb_headroom(skb) < headroom) { I802_DEBUG_INC(local->tx_expand_skb_head); if (pskb_expand_head(skb, headroom, 0, GFP_ATOMIC)) { wiphy_debug(local->hw.wiphy, "failed to reallocate TX buffer\n"); return false; } } return true; } static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; struct ethhdr *amsdu_hdr; int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header); int subframe_len = skb->len - hdr_len; void *data; u8 *qc, *h_80211_src, *h_80211_dst; const u8 *bssid; if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) return false; if (info->control.flags & IEEE80211_TX_CTRL_AMSDU) return true; if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr) + local->hw.extra_tx_headroom)) return false; data = skb_push(skb, sizeof(*amsdu_hdr)); memmove(data, data + sizeof(*amsdu_hdr), hdr_len); hdr = data; amsdu_hdr = data + hdr_len; /* h_80211_src/dst is addr* field within hdr */ h_80211_src = data + fast_tx->sa_offs; h_80211_dst = data + fast_tx->da_offs; amsdu_hdr->h_proto = cpu_to_be16(subframe_len); ether_addr_copy(amsdu_hdr->h_source, h_80211_src); ether_addr_copy(amsdu_hdr->h_dest, h_80211_dst); /* according to IEEE 802.11-2012 8.3.2 table 8-19, the outer SA/DA * fields needs to be changed to BSSID for A-MSDU frames depending * on FromDS/ToDS values. */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: bssid = sdata->u.mgd.bssid; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: bssid = sdata->vif.addr; break; default: bssid = NULL; } if (bssid && ieee80211_has_fromds(hdr->frame_control)) ether_addr_copy(h_80211_src, bssid); if (bssid && ieee80211_has_tods(hdr->frame_control)) ether_addr_copy(h_80211_dst, bssid); qc = ieee80211_get_qos_ctl(hdr); *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT; info->control.flags |= IEEE80211_TX_CTRL_AMSDU; return true; } static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct fq *fq = &local->fq; struct fq_tin *tin; struct fq_flow *flow; u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; struct ieee80211_txq *txq = sta->sta.txq[tid]; struct txq_info *txqi; struct sk_buff **frag_tail, *head; int subframe_len = skb->len - ETH_ALEN; u8 max_subframes = sta->sta.max_amsdu_subframes; int max_frags = local->hw.max_tx_fragments; int max_amsdu_len = sta->sta.max_amsdu_len; int orig_truesize; u32 flow_idx; __be16 len; void *data; bool ret = false; unsigned int orig_len; int n = 2, nfrags, pad = 0; u16 hdrlen; if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) return false; if (sdata->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED) return false; if (skb_is_gso(skb)) return false; if (!txq) return false; txqi = to_txq_info(txq); if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags)) return false; if (sta->sta.max_rc_amsdu_len) max_amsdu_len = min_t(int, max_amsdu_len, sta->sta.max_rc_amsdu_len); if (sta->sta.max_tid_amsdu_len[tid]) max_amsdu_len = min_t(int, max_amsdu_len, sta->sta.max_tid_amsdu_len[tid]); flow_idx = fq_flow_idx(fq, skb); spin_lock_bh(&fq->lock); /* TODO: Ideally aggregation should be done on dequeue to remain * responsive to environment changes. */ tin = &txqi->tin; flow = fq_flow_classify(fq, tin, flow_idx, skb); head = skb_peek_tail(&flow->queue); if (!head || skb_is_gso(head)) goto out; orig_truesize = head->truesize; orig_len = head->len; if (skb->len + head->len > max_amsdu_len) goto out; nfrags = 1 + skb_shinfo(skb)->nr_frags; nfrags += 1 + skb_shinfo(head)->nr_frags; frag_tail = &skb_shinfo(head)->frag_list; while (*frag_tail) { nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags; frag_tail = &(*frag_tail)->next; n++; } if (max_subframes && n > max_subframes) goto out; if (max_frags && nfrags > max_frags) goto out; if (!drv_can_aggregate_in_amsdu(local, head, skb)) goto out; if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) goto out; /* If n == 2, the "while (*frag_tail)" loop above didn't execute * and frag_tail should be &skb_shinfo(head)->frag_list. * However, ieee80211_amsdu_prepare_head() can reallocate it. * Reload frag_tail to have it pointing to the correct place. */ if (n == 2) frag_tail = &skb_shinfo(head)->frag_list; /* * Pad out the previous subframe to a multiple of 4 by adding the * padding to the next one, that's being added. Note that head->len * is the length of the full A-MSDU, but that works since each time * we add a new subframe we pad out the previous one to a multiple * of 4 and thus it no longer matters in the next round. */ hdrlen = fast_tx->hdr_len - sizeof(rfc1042_header); if ((head->len - hdrlen) & 3) pad = 4 - ((head->len - hdrlen) & 3); if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2 + pad)) goto out_recalc; ret = true; data = skb_push(skb, ETH_ALEN + 2); memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN); data += 2 * ETH_ALEN; len = cpu_to_be16(subframe_len); memcpy(data, &len, 2); memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header)); memset(skb_push(skb, pad), 0, pad); head->len += skb->len; head->data_len += skb->len; *frag_tail = skb; out_recalc: fq->memory_usage += head->truesize - orig_truesize; if (head->len != orig_len) { flow->backlog += head->len - orig_len; tin->backlog_bytes += head->len - orig_len; } out: spin_unlock_bh(&fq->lock); return ret; } /* * Can be called while the sta lock is held. Anything that can cause packets to * be generated will cause deadlock! */ static ieee80211_tx_result ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 pn_offs, struct ieee80211_key *key, struct ieee80211_tx_data *tx) { struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; u8 tid = IEEE80211_NUM_TIDS; if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL) && ieee80211_tx_h_rate_ctrl(tx) != TX_CONTINUE) return TX_DROP; if (key) info->control.hw_key = &key->conf; dev_sw_netstats_tx_add(skb->dev, 1, skb->len); if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); sdata->sequence_number += 0x10; } if (skb_shinfo(skb)->gso_size) sta->tx_stats.msdu[tid] += DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); else sta->tx_stats.msdu[tid]++; info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; /* statistics normally done by ieee80211_tx_h_stats (but that * has to consider fragmentation, so is more complex) */ sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; if (pn_offs) { u64 pn; u8 *crypto_hdr = skb->data + pn_offs; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: pn = atomic64_inc_return(&key->conf.tx_pn); crypto_hdr[0] = pn; crypto_hdr[1] = pn >> 8; crypto_hdr[3] = 0x20 | (key->conf.keyidx << 6); crypto_hdr[4] = pn >> 16; crypto_hdr[5] = pn >> 24; crypto_hdr[6] = pn >> 32; crypto_hdr[7] = pn >> 40; break; } } return TX_CONTINUE; } static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; u16 ethertype = (skb->data[12] << 8) | skb->data[13]; int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); int hw_headroom = sdata->local->hw.extra_tx_headroom; struct ethhdr eth; struct ieee80211_tx_info *info; struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; struct ieee80211_tx_data tx; ieee80211_tx_result r; struct tid_ampdu_tx *tid_tx = NULL; u8 tid = IEEE80211_NUM_TIDS; /* control port protocol needs a lot of special handling */ if (cpu_to_be16(ethertype) == sdata->control_port_protocol) return false; /* only RFC 1042 SNAP */ if (ethertype < ETH_P_802_3_MIN) return false; /* don't handle TX status request here either */ if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) return false; if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) return false; if (tid_tx->timeout) tid_tx->last_tx = jiffies; } } /* after this point (skb is modified) we cannot return false */ if (skb_shared(skb)) { struct sk_buff *tmp_skb = skb; skb = skb_clone(skb, GFP_ATOMIC); kfree_skb(tmp_skb); if (!skb) return true; } if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) && ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb)) return true; /* will not be crypto-handled beyond what we do here, so use false * as the may-encrypt argument for the resize to not account for * more room than we already have in 'extra_head' */ if (unlikely(ieee80211_skb_resize(sdata, skb, max_t(int, extra_head + hw_headroom - skb_headroom(skb), 0), ENCRYPT_NO))) { kfree_skb(skb); return true; } memcpy(ð, skb->data, ETH_HLEN - 2); hdr = skb_push(skb, extra_head); memcpy(skb->data, fast_tx->hdr, fast_tx->hdr_len); memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN); memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN); info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->band = fast_tx->band; info->control.vif = &sdata->vif; info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | IEEE80211_TX_CTL_DONTFRAG | (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT; #ifdef CONFIG_MAC80211_DEBUGFS if (local->force_tx_status) info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; #endif if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; *ieee80211_get_qos_ctl(hdr) = tid; } __skb_queue_head_init(&tx.skbs); tx.flags = IEEE80211_TX_UNICAST; tx.local = local; tx.sdata = sdata; tx.sta = sta; tx.key = fast_tx->key; if (ieee80211_queue_skb(local, sdata, sta, skb)) return true; tx.skb = skb; r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, fast_tx->key, &tx); tx.skb = NULL; if (r == TX_DROP) { kfree_skb(skb); return true; } if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); __skb_queue_tail(&tx.skbs, skb); ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false); return true; } struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = container_of(txq, struct txq_info, txq); struct ieee80211_hdr *hdr; struct sk_buff *skb = NULL; struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; struct ieee80211_vif *vif = txq->vif; WARN_ON_ONCE(softirq_count() == 0); if (!ieee80211_txq_airtime_check(hw, txq)) return NULL; begin: spin_lock_bh(&fq->lock); if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags) || test_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags)) goto out; if (vif->txqs_stopped[txq->ac]) { set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags); goto out; } /* Make sure fragments stay together. */ skb = __skb_dequeue(&txqi->frags); if (unlikely(skb)) { if (!(IEEE80211_SKB_CB(skb)->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING)) goto out; IEEE80211_SKB_CB(skb)->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING; } else { skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); } if (!skb) goto out; spin_unlock_bh(&fq->lock); hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); memset(&tx, 0, sizeof(tx)); __skb_queue_head_init(&tx.skbs); tx.local = local; tx.skb = skb; tx.sdata = vif_to_sdata(info->control.vif); if (txq->sta) { tx.sta = container_of(txq->sta, struct sta_info, sta); /* * Drop unicast frames to unauthorised stations unless they are * injected frames or EAPOL frames from the local station. */ if (unlikely(!(info->flags & IEEE80211_TX_CTL_INJECTED) && ieee80211_is_data(hdr->frame_control) && !ieee80211_vif_is_mesh(&tx.sdata->vif) && tx.sdata->vif.type != NL80211_IFTYPE_OCB && !is_multicast_ether_addr(hdr->addr1) && !test_sta_flag(tx.sta, WLAN_STA_AUTHORIZED) && (!(info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO) || !ether_addr_equal(tx.sdata->vif.addr, hdr->addr2)))) { I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); ieee80211_free_txskb(&local->hw, skb); goto begin; } } /* * The key can be removed while the packet was queued, so need to call * this here to get the current key. */ r = ieee80211_tx_h_select_key(&tx); if (r != TX_CONTINUE) { ieee80211_free_txskb(&local->hw, skb); goto begin; } if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags)) info->flags |= IEEE80211_TX_CTL_AMPDU; else info->flags &= ~IEEE80211_TX_CTL_AMPDU; if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { r = ieee80211_tx_h_rate_ctrl(&tx); if (r != TX_CONTINUE) { ieee80211_free_txskb(&local->hw, skb); goto begin; } } goto encap_out; } if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) { struct sta_info *sta = container_of(txq->sta, struct sta_info, sta); u8 pn_offs = 0; if (tx.key && (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) pn_offs = ieee80211_hdrlen(hdr->frame_control); r = ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs, tx.key, &tx); if (r != TX_CONTINUE) { ieee80211_free_txskb(&local->hw, skb); goto begin; } } else { if (invoke_tx_handlers_late(&tx)) goto begin; skb = __skb_dequeue(&tx.skbs); info = IEEE80211_SKB_CB(skb); if (!skb_queue_empty(&tx.skbs)) { spin_lock_bh(&fq->lock); skb_queue_splice_tail(&tx.skbs, &txqi->frags); spin_unlock_bh(&fq->lock); } } if (skb_has_frag_list(skb) && !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { if (skb_linearize(skb)) { ieee80211_free_txskb(&local->hw, skb); goto begin; } } switch (tx.sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { vif = &tx.sdata->vif; break; } tx.sdata = rcu_dereference(local->monitor_sdata); if (tx.sdata) { vif = &tx.sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { ieee80211_free_txskb(&local->hw, skb); goto begin; } else { vif = NULL; } break; case NL80211_IFTYPE_AP_VLAN: tx.sdata = container_of(tx.sdata->bss, struct ieee80211_sub_if_data, u.ap); fallthrough; default: vif = &tx.sdata->vif; break; } encap_out: info->control.vif = vif; if (vif && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) { bool ampdu = txq->ac != IEEE80211_AC_VO; u32 airtime; airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta, skb->len, ampdu); if (airtime) { airtime = ieee80211_info_set_tx_time_est(info, airtime); ieee80211_sta_update_pending_airtime(local, tx.sta, txq->ac, airtime, false); } } return skb; out: spin_unlock_bh(&fq->lock); return skb; } EXPORT_SYMBOL(ieee80211_tx_dequeue); struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); struct airtime_sched_info *air_sched; u64 now = ktime_get_boottime_ns(); struct ieee80211_txq *ret = NULL; struct airtime_info *air_info; struct txq_info *txqi = NULL; struct rb_node *node; bool first = false; air_sched = &local->airtime[ac]; spin_lock_bh(&air_sched->lock); node = air_sched->schedule_pos; begin: if (!node) { node = rb_first_cached(&air_sched->active_txqs); first = true; } else { node = rb_next(node); } if (!node) goto out; txqi = container_of(node, struct txq_info, schedule_order); air_info = to_airtime_info(&txqi->txq); if (air_info->v_t > air_sched->v_t && (!first || !airtime_catchup_v_t(air_sched, air_info->v_t, now))) goto out; if (!ieee80211_txq_airtime_check(hw, &txqi->txq)) { first = false; goto begin; } air_sched->schedule_pos = node; air_sched->last_schedule_activity = now; ret = &txqi->txq; out: spin_unlock_bh(&air_sched->lock); return ret; } EXPORT_SYMBOL(ieee80211_next_txq); static void __ieee80211_insert_txq(struct rb_root_cached *root, struct txq_info *txqi) { struct rb_node **new = &root->rb_root.rb_node; struct airtime_info *old_air, *new_air; struct rb_node *parent = NULL; struct txq_info *__txqi; bool leftmost = true; while (*new) { parent = *new; __txqi = rb_entry(parent, struct txq_info, schedule_order); old_air = to_airtime_info(&__txqi->txq); new_air = to_airtime_info(&txqi->txq); if (new_air->v_t <= old_air->v_t) { new = &parent->rb_left; } else { new = &parent->rb_right; leftmost = false; } } rb_link_node(&txqi->schedule_order, parent, new); rb_insert_color_cached(&txqi->schedule_order, root, leftmost); } void ieee80211_resort_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct airtime_info *air_info = to_airtime_info(txq); struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); struct airtime_sched_info *air_sched; air_sched = &local->airtime[txq->ac]; lockdep_assert_held(&air_sched->lock); if (!RB_EMPTY_NODE(&txqi->schedule_order)) { struct airtime_info *a_prev = NULL, *a_next = NULL; struct txq_info *t_prev, *t_next; struct rb_node *n_prev, *n_next; /* Erasing a node can cause an expensive rebalancing operation, * so we check the previous and next nodes first and only remove * and re-insert if the current node is not already in the * correct position. */ if ((n_prev = rb_prev(&txqi->schedule_order)) != NULL) { t_prev = container_of(n_prev, struct txq_info, schedule_order); a_prev = to_airtime_info(&t_prev->txq); } if ((n_next = rb_next(&txqi->schedule_order)) != NULL) { t_next = container_of(n_next, struct txq_info, schedule_order); a_next = to_airtime_info(&t_next->txq); } if ((!a_prev || a_prev->v_t <= air_info->v_t) && (!a_next || a_next->v_t > air_info->v_t)) return; if (air_sched->schedule_pos == &txqi->schedule_order) air_sched->schedule_pos = n_prev; rb_erase_cached(&txqi->schedule_order, &air_sched->active_txqs); RB_CLEAR_NODE(&txqi->schedule_order); __ieee80211_insert_txq(&air_sched->active_txqs, txqi); } } void ieee80211_update_airtime_weight(struct ieee80211_local *local, struct airtime_sched_info *air_sched, u64 now, bool force) { struct airtime_info *air_info, *tmp; u64 weight_sum = 0; if (unlikely(!now)) now = ktime_get_boottime_ns(); lockdep_assert_held(&air_sched->lock); if (!force && (air_sched->last_weight_update < now - AIRTIME_ACTIVE_DURATION)) return; list_for_each_entry_safe(air_info, tmp, &air_sched->active_list, list) { if (airtime_is_active(air_info, now)) weight_sum += air_info->weight; else list_del_init(&air_info->list); } airtime_weight_sum_set(air_sched, weight_sum); air_sched->last_weight_update = now; } void ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq) __acquires(txq_lock) __releases(txq_lock) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); struct airtime_sched_info *air_sched; u64 now = ktime_get_boottime_ns(); struct airtime_info *air_info; u8 ac = txq->ac; bool was_active; air_sched = &local->airtime[ac]; air_info = to_airtime_info(txq); spin_lock_bh(&air_sched->lock); was_active = airtime_is_active(air_info, now); airtime_set_active(air_sched, air_info, now); if (!RB_EMPTY_NODE(&txqi->schedule_order)) goto out; /* If the station has been inactive for a while, catch up its v_t so it * doesn't get indefinite priority; see comment above the definition of * AIRTIME_MAX_BEHIND. */ if ((!was_active && air_info->v_t < air_sched->v_t) || air_info->v_t < air_sched->v_t - AIRTIME_MAX_BEHIND) air_info->v_t = air_sched->v_t; ieee80211_update_airtime_weight(local, air_sched, now, !was_active); __ieee80211_insert_txq(&air_sched->active_txqs, txqi); out: spin_unlock_bh(&air_sched->lock); } EXPORT_SYMBOL(ieee80211_schedule_txq); static void __ieee80211_unschedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, bool purge) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); struct airtime_sched_info *air_sched; struct airtime_info *air_info; air_sched = &local->airtime[txq->ac]; air_info = to_airtime_info(&txqi->txq); lockdep_assert_held(&air_sched->lock); if (purge) { list_del_init(&air_info->list); ieee80211_update_airtime_weight(local, air_sched, 0, true); } if (RB_EMPTY_NODE(&txqi->schedule_order)) return; if (air_sched->schedule_pos == &txqi->schedule_order) air_sched->schedule_pos = rb_prev(&txqi->schedule_order); if (!purge) airtime_set_active(air_sched, air_info, ktime_get_boottime_ns()); rb_erase_cached(&txqi->schedule_order, &air_sched->active_txqs); RB_CLEAR_NODE(&txqi->schedule_order); } void ieee80211_unschedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, bool purge) __acquires(txq_lock) __releases(txq_lock) { struct ieee80211_local *local = hw_to_local(hw); spin_lock_bh(&local->airtime[txq->ac].lock); __ieee80211_unschedule_txq(hw, txq, purge); spin_unlock_bh(&local->airtime[txq->ac].lock); } void ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, bool force) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); spin_lock_bh(&local->airtime[txq->ac].lock); if (!RB_EMPTY_NODE(&txqi->schedule_order) && !force && !txq_has_queue(txq)) __ieee80211_unschedule_txq(hw, txq, false); spin_unlock_bh(&local->airtime[txq->ac].lock); } EXPORT_SYMBOL(ieee80211_return_txq); DEFINE_STATIC_KEY_FALSE(aql_disable); bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct airtime_info *air_info = to_airtime_info(txq); struct ieee80211_local *local = hw_to_local(hw); if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return true; if (static_branch_unlikely(&aql_disable)) return true; if (!txq->sta) return true; if (unlikely(txq->tid == IEEE80211_NUM_TIDS)) return true; if (atomic_read(&air_info->aql_tx_pending) < air_info->aql_limit_low) return true; if (atomic_read(&local->aql_total_pending_airtime) < local->aql_threshold && atomic_read(&air_info->aql_tx_pending) < air_info->aql_limit_high) return true; return false; } EXPORT_SYMBOL(ieee80211_txq_airtime_check); bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct txq_info *first_txqi = NULL, *txqi = to_txq_info(txq); struct ieee80211_local *local = hw_to_local(hw); struct airtime_sched_info *air_sched; struct airtime_info *air_info; struct rb_node *node = NULL; bool ret = false; u64 now; if (!ieee80211_txq_airtime_check(hw, txq)) return false; air_sched = &local->airtime[txq->ac]; spin_lock_bh(&air_sched->lock); if (RB_EMPTY_NODE(&txqi->schedule_order)) goto out; now = ktime_get_boottime_ns(); /* Like in ieee80211_next_txq(), make sure the first station in the * scheduling order is eligible for transmission to avoid starvation. */ node = rb_first_cached(&air_sched->active_txqs); if (node) { first_txqi = container_of(node, struct txq_info, schedule_order); air_info = to_airtime_info(&first_txqi->txq); if (air_sched->v_t < air_info->v_t) airtime_catchup_v_t(air_sched, air_info->v_t, now); } air_info = to_airtime_info(&txqi->txq); if (air_info->v_t <= air_sched->v_t) { air_sched->last_schedule_activity = now; ret = true; } out: spin_unlock_bh(&air_sched->lock); return ret; } EXPORT_SYMBOL(ieee80211_txq_may_transmit); void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); struct airtime_sched_info *air_sched = &local->airtime[ac]; spin_lock_bh(&air_sched->lock); air_sched->schedule_pos = NULL; spin_unlock_bh(&air_sched->lock); } EXPORT_SYMBOL(ieee80211_txq_schedule_start); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, u32 ctrl_flags, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct sk_buff *next; int len = skb->len; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); return; } rcu_read_lock(); if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) goto out_free; if (IS_ERR(sta)) sta = NULL; if (local->ops->wake_tx_queue) { u16 queue = __ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); skb_get_hash(skb); } ieee80211_aggr_check(sdata, sta, skb); if (sta) { struct ieee80211_fast_tx *fast_tx; sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); fast_tx = rcu_dereference(sta->fast_tx); if (fast_tx && ieee80211_xmit_fast(sdata, sta, fast_tx, skb)) goto out; } if (skb_is_gso(skb)) { struct sk_buff *segs; segs = skb_gso_segment(skb, 0); if (IS_ERR(segs)) { goto out_free; } else if (segs) { consume_skb(skb); skb = segs; } } else { /* we cannot process non-linear frames on this path */ if (skb_linearize(skb)) goto out_free; /* the frame could be fragmented, software-encrypted, and other * things so we cannot really handle checksum offload with it - * fix it up in software before we handle anything else. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_set_transport_header(skb, skb_checksum_start_offset(skb)); if (skb_checksum_help(skb)) goto out_free; } } skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); if (skb->protocol == sdata->control_port_protocol) ctrl_flags |= IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP; skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, ctrl_flags, cookie); if (IS_ERR(skb)) { kfree_skb_list(next); goto out; } dev_sw_netstats_tx_add(dev, 1, skb->len); ieee80211_xmit(sdata, sta, skb); } goto out; out_free: kfree_skb(skb); len = 0; out: if (len) ieee80211_tpt_led_trig_tx(local, len); rcu_read_unlock(); } static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta) { struct ethhdr *eth; int err; err = skb_ensure_writable(skb, ETH_HLEN); if (unlikely(err)) return err; eth = (void *)skb->data; ether_addr_copy(eth->h_dest, sta->sta.addr); return 0; } static bool ieee80211_multicast_to_unicast(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); const struct ethhdr *eth = (void *)skb->data; const struct vlan_ethhdr *ethvlan = (void *)skb->data; __be16 ethertype; if (likely(!is_multicast_ether_addr(eth->h_dest))) return false; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: if (sdata->u.vlan.sta) return false; if (sdata->wdev.use_4addr) return false; fallthrough; case NL80211_IFTYPE_AP: /* check runtime toggle for this bss */ if (!sdata->bss->multicast_to_unicast) return false; break; default: return false; } /* multicast to unicast conversion only for some payload */ ethertype = eth->h_proto; if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN) ethertype = ethvlan->h_vlan_encapsulated_proto; switch (ethertype) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): break; default: return false; } return true; } static void ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev, struct sk_buff_head *queue) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; const struct ethhdr *eth = (struct ethhdr *)skb->data; struct sta_info *sta, *first = NULL; struct sk_buff *cloned_skb; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata) /* AP-VLAN mismatch */ continue; if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr))) /* do not send back to source */ continue; if (!first) { first = sta; continue; } cloned_skb = skb_clone(skb, GFP_ATOMIC); if (!cloned_skb) goto multicast; if (unlikely(ieee80211_change_da(cloned_skb, sta))) { dev_kfree_skb(cloned_skb); goto multicast; } __skb_queue_tail(queue, cloned_skb); } if (likely(first)) { if (unlikely(ieee80211_change_da(skb, first))) goto multicast; __skb_queue_tail(queue, skb); } else { /* no STA connected, drop */ kfree_skb(skb); skb = NULL; } goto out; multicast: __skb_queue_purge(queue); __skb_queue_tail(queue, skb); out: rcu_read_unlock(); } /** * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs * @skb: packet to be sent * @dev: incoming interface * * On failure skb will be freed. */ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) { if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) { struct sk_buff_head queue; __skb_queue_head_init(&queue); ieee80211_convert_to_unicast(skb, dev, &queue); while ((skb = __skb_dequeue(&queue))) __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } else { __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } return NETDEV_TX_OK; } static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct sta_info *sta, bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_control control = {}; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sta *pubsta = NULL; unsigned long flags; int q = info->hw_queue; if (sta) sk_pacing_shift_update(skb->sk, local->hw.tx_sk_pacing_shift); ieee80211_tpt_led_trig_tx(local, skb->len); if (ieee80211_queue_skb(local, sdata, sta, skb)) return true; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { if (txpending) skb_queue_head(&local->pending[q], skb); else skb_queue_tail(&local->pending[q], skb); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return false; } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); if (sta && sta->uploaded) pubsta = &sta->sta; control.sta = pubsta; drv_tx(local, &control, skb); return true; } static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct net_device *dev, struct sta_info *sta, struct ieee80211_key *key, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_local *local = sdata->local; struct tid_ampdu_tx *tid_tx; u8 tid; if (local->ops->wake_tx_queue) { u16 queue = __ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); skb_get_hash(skb); } if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) goto out_free; memset(info, 0, sizeof(*info)); ieee80211_aggr_check(sdata, sta, skb); tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { /* fall back to non-offload slow path */ __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); return; } info->flags |= IEEE80211_TX_CTL_AMPDU; if (tid_tx->timeout) tid_tx->last_tx = jiffies; } if (unlikely(skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) info->ack_frame_id = ieee80211_store_ack_skb(local, skb, &info->flags, NULL); info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; dev_sw_netstats_tx_add(dev, 1, skb->len); sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); info->flags |= IEEE80211_TX_CTL_HW_80211_ENCAP; info->control.vif = &sdata->vif; if (key) info->control.hw_key = &key->conf; ieee80211_tx_8023(sdata, skb, sta, false); return; out_free: kfree_skb(skb); } netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_key *key; struct sta_info *sta; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); return NETDEV_TX_OK; } rcu_read_lock(); if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { kfree_skb(skb); goto out; } if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || sdata->control_port_protocol == ehdr->h_proto)) goto skip_offload; key = rcu_dereference(sta->ptk[sta->ptk_idx]); if (!key) key = rcu_dereference(sdata->default_unicast_key); if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) goto skip_offload; ieee80211_8023_xmit(sdata, dev, sta, key, skb); goto out; skip_offload: ieee80211_subif_start_xmit(skb, dev); out: rcu_read_unlock(); return NETDEV_TX_OK; } struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags) { struct ieee80211_hdr *hdr; struct ieee80211_tx_data tx = { .local = sdata->local, .sdata = sdata, }; struct sta_info *sta; rcu_read_lock(); if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { kfree_skb(skb); skb = ERR_PTR(-EINVAL); goto out; } skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0, NULL); if (IS_ERR(skb)) goto out; hdr = (void *)skb->data; tx.sta = sta_info_get(sdata, hdr->addr1); tx.skb = skb; if (ieee80211_tx_h_select_key(&tx) != TX_CONTINUE) { rcu_read_unlock(); kfree_skb(skb); return ERR_PTR(-EINVAL); } out: rcu_read_unlock(); return skb; } /* * ieee80211_clear_tx_pending may not be called in a context where * it is possible that it packets could come in again. */ void ieee80211_clear_tx_pending(struct ieee80211_local *local) { struct sk_buff *skb; int i; for (i = 0; i < local->hw.queues; i++) { while ((skb = skb_dequeue(&local->pending[i])) != NULL) ieee80211_free_txskb(&local->hw, skb); } } /* * Returns false if the frame couldn't be transmitted but was queued instead, * which in this case means re-queued -- take as an indication to stop sending * more pending frames. */ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct sta_info *sta; struct ieee80211_hdr *hdr; bool result; struct ieee80211_chanctx_conf *chanctx_conf; sdata = vif_to_sdata(info->control.vif); if (info->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING) { chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (unlikely(!chanctx_conf)) { dev_kfree_skb(skb); return true; } info->band = chanctx_conf->def.chan->band; result = ieee80211_tx(sdata, NULL, skb, true); } else if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { dev_kfree_skb(skb); return true; } if (IS_ERR(sta) || (sta && !sta->uploaded)) sta = NULL; result = ieee80211_tx_8023(sdata, skb, sta, true); } else { struct sk_buff_head skbs; __skb_queue_head_init(&skbs); __skb_queue_tail(&skbs, skb); hdr = (struct ieee80211_hdr *)skb->data; sta = sta_info_get(sdata, hdr->addr1); result = __ieee80211_tx(local, &skbs, sta, true); } return result; } /* * Transmit all pending packets. Called from tasklet. */ void ieee80211_tx_pending(struct tasklet_struct *t) { struct ieee80211_local *local = from_tasklet(local, t, tx_pending_tasklet); unsigned long flags; int i; bool txok; rcu_read_lock(); spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (i = 0; i < local->hw.queues; i++) { /* * If queue is stopped by something other than due to pending * frames, or we have no pending frames, proceed to next queue. */ if (local->queue_stop_reasons[i] || skb_queue_empty(&local->pending[i])) continue; while (!skb_queue_empty(&local->pending[i])) { struct sk_buff *skb = __skb_dequeue(&local->pending[i]); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); continue; } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); txok = ieee80211_tx_pending_skb(local, skb); spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (!txok) break; } if (skb_queue_empty(&local->pending[i])) ieee80211_propagate_queue_wake(local, i); } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); rcu_read_unlock(); } /* functions for drivers to get certain frames */ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ps_data *ps, struct sk_buff *skb, bool is_template) { u8 *pos, *tim; int aid0 = 0; int i, have_bits = 0, n1, n2; /* Generate bitmap for TIM only if there are any STAs in power save * mode. */ if (atomic_read(&ps->num_sta_ps) > 0) /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, IEEE80211_MAX_AID+1); if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = sdata->vif.bss_conf.dtim_period - 1; else ps->dtim_count--; } tim = pos = skb_put(skb, 6); *pos++ = WLAN_EID_TIM; *pos++ = 4; *pos++ = ps->dtim_count; *pos++ = sdata->vif.bss_conf.dtim_period; if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) aid0 = 1; ps->dtim_bc_mc = aid0 == 1; if (have_bits) { /* Find largest even number N1 so that bits numbered 1 through * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits * (N2 + 1) x 8 through 2007 are 0. */ n1 = 0; for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { if (ps->tim[i]) { n1 = i & 0xfe; break; } } n2 = n1; for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { if (ps->tim[i]) { n2 = i; break; } } /* Bitmap control */ *pos++ = n1 | aid0; /* Part Virt Bitmap */ skb_put(skb, n2 - n1); memcpy(pos, ps->tim + n1, n2 - n1 + 1); tim[1] = n2 - n1 + 4; } else { *pos++ = aid0; /* Bitmap control */ *pos++ = 0; /* Part Virt Bitmap */ } } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ps_data *ps, struct sk_buff *skb, bool is_template) { struct ieee80211_local *local = sdata->local; /* * Not very nice, but we want to allow the driver to call * ieee80211_beacon_get() as a response to the set_tim() * callback. That, however, is already invoked under the * sta_lock to guarantee consistent and race-free update * of the tim bitmap in mac80211 and the driver. */ if (local->tim_in_locked_section) { __ieee80211_beacon_add_tim(sdata, ps, skb, is_template); } else { spin_lock_bh(&local->tim_lock); __ieee80211_beacon_add_tim(sdata, ps, skb, is_template); spin_unlock_bh(&local->tim_lock); } return 0; } static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon) { u8 *beacon_data, count, max_count = 1; struct probe_resp *resp; size_t beacon_data_len; u16 *bcn_offsets; int i; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: beacon_data = beacon->tail; beacon_data_len = beacon->tail_len; break; case NL80211_IFTYPE_ADHOC: beacon_data = beacon->head; beacon_data_len = beacon->head_len; break; case NL80211_IFTYPE_MESH_POINT: beacon_data = beacon->head; beacon_data_len = beacon->head_len; break; default: return; } rcu_read_lock(); resp = rcu_dereference(sdata->u.ap.probe_resp); bcn_offsets = beacon->cntdwn_counter_offsets; count = beacon->cntdwn_current_counter; if (sdata->vif.csa_active) max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM; for (i = 0; i < max_count; ++i) { if (bcn_offsets[i]) { if (WARN_ON_ONCE(bcn_offsets[i] >= beacon_data_len)) { rcu_read_unlock(); return; } beacon_data[bcn_offsets[i]] = count; } if (sdata->vif.type == NL80211_IFTYPE_AP && resp) { u16 *resp_offsets = resp->cntdwn_counter_offsets; resp->data[resp_offsets[i]] = count; } } rcu_read_unlock(); } static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon) { beacon->cntdwn_current_counter--; /* the counter should never reach 0 */ WARN_ON_ONCE(!beacon->cntdwn_current_counter); return beacon->cntdwn_current_counter; } u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; u8 count = 0; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(sdata->u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (!beacon) goto unlock; count = __ieee80211_beacon_update_cntdwn(beacon); unlock: rcu_read_unlock(); return count; } EXPORT_SYMBOL(ieee80211_beacon_update_cntdwn); void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(sdata->u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (!beacon) goto unlock; if (counter < beacon->cntdwn_current_counter) beacon->cntdwn_current_counter = counter; unlock: rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_beacon_set_cntdwn); bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct beacon_data *beacon = NULL; u8 *beacon_data; size_t beacon_data_len; int ret = false; if (!ieee80211_sdata_running(sdata)) return false; rcu_read_lock(); if (vif->type == NL80211_IFTYPE_AP) { struct ieee80211_if_ap *ap = &sdata->u.ap; beacon = rcu_dereference(ap->beacon); if (WARN_ON(!beacon || !beacon->tail)) goto out; beacon_data = beacon->tail; beacon_data_len = beacon->tail_len; } else if (vif->type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; beacon = rcu_dereference(ifibss->presp); if (!beacon) goto out; beacon_data = beacon->head; beacon_data_len = beacon->head_len; } else if (vif->type == NL80211_IFTYPE_MESH_POINT) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; beacon = rcu_dereference(ifmsh->beacon); if (!beacon) goto out; beacon_data = beacon->head; beacon_data_len = beacon->head_len; } else { WARN_ON(1); goto out; } if (!beacon->cntdwn_counter_offsets[0]) goto out; if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[0] > beacon_data_len)) goto out; if (beacon_data[beacon->cntdwn_counter_offsets[0]] == 1) ret = true; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(ieee80211_beacon_cntdwn_is_complete); static int ieee80211_beacon_protect(struct sk_buff *skb, struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { ieee80211_tx_result res; struct ieee80211_tx_data tx; struct sk_buff *check_skb; memset(&tx, 0, sizeof(tx)); tx.key = rcu_dereference(sdata->default_beacon_key); if (!tx.key) return 0; tx.local = local; tx.sdata = sdata; __skb_queue_head_init(&tx.skbs); __skb_queue_tail(&tx.skbs, skb); res = ieee80211_tx_h_encrypt(&tx); check_skb = __skb_dequeue(&tx.skbs); /* we may crash after this, but it'd be a bug in crypto */ WARN_ON(check_skb != skb); if (WARN_ON_ONCE(res != TX_CONTINUE)) return -EINVAL; return 0; } static struct sk_buff * __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, bool is_template) { struct ieee80211_local *local = hw_to_local(hw); struct beacon_data *beacon = NULL; struct sk_buff *skb = NULL; struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata = NULL; enum nl80211_band band; struct ieee80211_tx_rate_control txrc; struct ieee80211_chanctx_conf *chanctx_conf; int csa_off_base = 0; rcu_read_lock(); sdata = vif_to_sdata(vif); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!ieee80211_sdata_running(sdata) || !chanctx_conf) goto out; if (offs) memset(offs, 0, sizeof(*offs)); if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_if_ap *ap = &sdata->u.ap; beacon = rcu_dereference(ap->beacon); if (beacon) { if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) ieee80211_beacon_update_cntdwn(vif); ieee80211_set_beacon_cntdwn(sdata, beacon); } /* * headroom, head length, * tail length and maximum TIM length */ skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + beacon->tail_len + 256 + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); ieee80211_beacon_add_tim(sdata, &ap->ps, skb, is_template); if (offs) { offs->tim_offset = beacon->head_len; offs->tim_length = skb->len - beacon->head_len; offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0]; /* for AP the csa offsets are from tail */ csa_off_base = skb->len; } if (beacon->tail) skb_put_data(skb, beacon->tail, beacon->tail_len); if (ieee80211_beacon_protect(skb, local, sdata) < 0) goto out; } else goto out; } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_hdr *hdr; beacon = rcu_dereference(ifibss->presp); if (!beacon) goto out; if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) __ieee80211_beacon_update_cntdwn(beacon); ieee80211_set_beacon_cntdwn(sdata, beacon); } skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); } else if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; beacon = rcu_dereference(ifmsh->beacon); if (!beacon) goto out; if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) /* TODO: For mesh csa_counter is in TU, so * decrementing it by one isn't correct, but * for now we leave it consistent with overall * mac80211's behavior. */ __ieee80211_beacon_update_cntdwn(beacon); ieee80211_set_beacon_cntdwn(sdata, beacon); } if (ifmsh->sync_ops) ifmsh->sync_ops->adjust_tsf(sdata, beacon); skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + 256 + /* TIM IE */ beacon->tail_len + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, beacon->head, beacon->head_len); ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb, is_template); if (offs) { offs->tim_offset = beacon->head_len; offs->tim_length = skb->len - beacon->head_len; } skb_put_data(skb, beacon->tail, beacon->tail_len); } else { WARN_ON(1); goto out; } /* CSA offsets */ if (offs && beacon) { int i; for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) { u16 csa_off = beacon->cntdwn_counter_offsets[i]; if (!csa_off) continue; offs->cntdwn_counter_offs[i] = csa_off_base + csa_off; } } band = chanctx_conf->def.chan->band; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; info->flags |= IEEE80211_TX_CTL_NO_ACK; info->band = band; memset(&txrc, 0, sizeof(txrc)); txrc.hw = hw; txrc.sband = local->hw.wiphy->bands[band]; txrc.bss_conf = &sdata->vif.bss_conf; txrc.skb = skb; txrc.reported_rate.idx = -1; if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; else txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); info->control.vif = vif; info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_ASSIGN_SEQ | IEEE80211_TX_CTL_FIRST_FRAGMENT; out: rcu_read_unlock(); return skb; } struct sk_buff * ieee80211_beacon_get_template(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs) { return __ieee80211_beacon_get(hw, vif, offs, true); } EXPORT_SYMBOL(ieee80211_beacon_get_template); struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 *tim_offset, u16 *tim_length) { struct ieee80211_mutable_offsets offs = {}; struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false); struct sk_buff *copy; struct ieee80211_supported_band *sband; int shift; if (!bcn) return bcn; if (tim_offset) *tim_offset = offs.tim_offset; if (tim_length) *tim_length = offs.tim_length; if (ieee80211_hw_check(hw, BEACON_TX_STATUS) || !hw_to_local(hw)->monitors) return bcn; /* send a copy to monitor interfaces */ copy = skb_copy(bcn, GFP_ATOMIC); if (!copy) return bcn; shift = ieee80211_vif_get_shift(vif); sband = ieee80211_get_sband(vif_to_sdata(vif)); if (!sband) return bcn; ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false, NULL); return bcn; } EXPORT_SYMBOL(ieee80211_beacon_get_tim); struct sk_buff *ieee80211_proberesp_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_if_ap *ap = NULL; struct sk_buff *skb = NULL; struct probe_resp *presp = NULL; struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); ap = &sdata->u.ap; presp = rcu_dereference(ap->probe_resp); if (!presp) goto out; skb = dev_alloc_skb(presp->len); if (!skb) goto out; skb_put_data(skb, presp->data, presp->len); hdr = (struct ieee80211_hdr *) skb->data; memset(hdr->addr1, 0, sizeof(hdr->addr1)); out: rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_proberesp_get); struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct sk_buff *skb = NULL; struct fils_discovery_data *tmpl = NULL; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); tmpl = rcu_dereference(sdata->u.ap.fils_discovery); if (!tmpl) { rcu_read_unlock(); return NULL; } skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); if (skb) { skb_reserve(skb, sdata->local->hw.extra_tx_headroom); skb_put_data(skb, tmpl->data, tmpl->len); } rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_fils_discovery_tmpl); struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct sk_buff *skb = NULL; struct unsol_bcast_probe_resp_data *tmpl = NULL; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; rcu_read_lock(); tmpl = rcu_dereference(sdata->u.ap.unsol_bcast_probe_resp); if (!tmpl) { rcu_read_unlock(); return NULL; } skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); if (skb) { skb_reserve(skb, sdata->local->hw.extra_tx_headroom); skb_put_data(skb, tmpl->data, tmpl->len); } rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_unsol_bcast_probe_resp_tmpl); struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata; struct ieee80211_if_managed *ifmgd; struct ieee80211_pspoll *pspoll; struct ieee80211_local *local; struct sk_buff *skb; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; sdata = vif_to_sdata(vif); ifmgd = &sdata->u.mgd; local = sdata->local; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*pspoll)); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); pspoll = skb_put_zero(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL); pspoll->aid = cpu_to_le16(sdata->vif.bss_conf.aid); /* aid in PS-Poll has its two MSBs each set to 1 */ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN); memcpy(pspoll->ta, vif->addr, ETH_ALEN); return skb; } EXPORT_SYMBOL(ieee80211_pspoll_get); struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, bool qos_ok) { struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_sub_if_data *sdata; struct ieee80211_if_managed *ifmgd; struct ieee80211_local *local; struct sk_buff *skb; bool qos = false; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; sdata = vif_to_sdata(vif); ifmgd = &sdata->u.mgd; local = sdata->local; if (qos_ok) { struct sta_info *sta; rcu_read_lock(); sta = sta_info_get(sdata, ifmgd->bssid); qos = sta && sta->sta.wme; rcu_read_unlock(); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc) + 2); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put_zero(skb, sizeof(*nullfunc)); nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS); if (qos) { __le16 qoshdr = cpu_to_le16(7); BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_STYPE_NULLFUNC) != IEEE80211_STYPE_QOS_NULLFUNC); nullfunc->frame_control |= cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC); skb->priority = 7; skb_set_queue_mapping(skb, IEEE80211_AC_VO); skb_put_data(skb, &qoshdr, sizeof(qoshdr)); } memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN); memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN); return skb; } EXPORT_SYMBOL(ieee80211_nullfunc_get); struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw, const u8 *src_addr, const u8 *ssid, size_t ssid_len, size_t tailroom) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_hdr_3addr *hdr; struct sk_buff *skb; size_t ie_ssid_len; u8 *pos; ie_ssid_len = 2 + ssid_len; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*hdr) + ie_ssid_len + tailroom); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); hdr = skb_put_zero(skb, sizeof(*hdr)); hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ); eth_broadcast_addr(hdr->addr1); memcpy(hdr->addr2, src_addr, ETH_ALEN); eth_broadcast_addr(hdr->addr3); pos = skb_put(skb, ie_ssid_len); *pos++ = WLAN_EID_SSID; *pos++ = ssid_len; if (ssid_len) memcpy(pos, ssid, ssid_len); pos += ssid_len; return skb; } EXPORT_SYMBOL(ieee80211_probereq_get); void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_rts *rts) { const struct ieee80211_hdr *hdr = frame; rts->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS); rts->duration = ieee80211_rts_duration(hw, vif, frame_len, frame_txctl); memcpy(rts->ra, hdr->addr1, sizeof(rts->ra)); memcpy(rts->ta, hdr->addr2, sizeof(rts->ta)); } EXPORT_SYMBOL(ieee80211_rts_get); void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_cts *cts) { const struct ieee80211_hdr *hdr = frame; cts->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS); cts->duration = ieee80211_ctstoself_duration(hw, vif, frame_len, frame_txctl); memcpy(cts->ra, hdr->addr1, sizeof(cts->ra)); } EXPORT_SYMBOL(ieee80211_ctstoself_get); struct sk_buff * ieee80211_get_buffered_bc(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_local *local = hw_to_local(hw); struct sk_buff *skb = NULL; struct ieee80211_tx_data tx; struct ieee80211_sub_if_data *sdata; struct ps_data *ps; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; sdata = vif_to_sdata(vif); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (!chanctx_conf) goto out; if (sdata->vif.type == NL80211_IFTYPE_AP) { struct beacon_data *beacon = rcu_dereference(sdata->u.ap.beacon); if (!beacon || !beacon->head) goto out; ps = &sdata->u.ap.ps; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { ps = &sdata->u.mesh.ps; } else { goto out; } if (ps->dtim_count != 0 || !ps->dtim_bc_mc) goto out; /* send buffered bc/mc only after DTIM beacon */ while (1) { skb = skb_dequeue(&ps->bc_buf); if (!skb) goto out; local->total_ps_buffered--; if (!skb_queue_empty(&ps->bc_buf) && skb->len >= 2) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; /* more buffered multicast/broadcast frames ==> set * MoreData flag in IEEE 802.11 header to inform PS * STAs */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } if (sdata->vif.type == NL80211_IFTYPE_AP) sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev); if (!ieee80211_tx_prepare(sdata, &tx, NULL, skb)) break; ieee80211_free_txskb(hw, skb); } info = IEEE80211_SKB_CB(skb); tx.flags |= IEEE80211_TX_PS_BUFFERED; info->band = chanctx_conf->def.chan->band; if (invoke_tx_handlers(&tx)) skb = NULL; out: rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_buffered_bc); int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ret; u32 queues; lockdep_assert_held(&local->sta_mtx); /* only some cases are supported right now */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: break; default: WARN_ON(1); return -EINVAL; } if (WARN_ON(tid >= IEEE80211_NUM_UPS)) return -EINVAL; if (sta->reserved_tid == tid) { ret = 0; goto out; } if (sta->reserved_tid != IEEE80211_TID_UNRESERVED) { sdata_err(sdata, "TID reservation already active\n"); ret = -EALREADY; goto out; } ieee80211_stop_vif_queues(sdata->local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); synchronize_net(); /* Tear down BA sessions so we stop aggregating on this TID */ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); } queues = BIT(sdata->vif.hw_queue[ieee802_1d_to_ac[tid]]); __ieee80211_flush_queues(local, sdata, queues, false); sta->reserved_tid = tid; ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ret = 0; out: return ret; } EXPORT_SYMBOL(ieee80211_reserve_tid); void ieee80211_unreserve_tid(struct ieee80211_sta *pubsta, u8 tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sub_if_data *sdata = sta->sdata; lockdep_assert_held(&sdata->local->sta_mtx); /* only some cases are supported right now */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: break; default: WARN_ON(1); return; } if (tid != sta->reserved_tid) { sdata_err(sdata, "TID to unreserve (%d) isn't reserved\n", tid); return; } sta->reserved_tid = IEEE80211_TID_UNRESERVED; } EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, enum nl80211_band band) { int ac = ieee80211_ac_from_tid(tid); skb_reset_mac_header(skb); skb_set_queue_mapping(skb, ac); skb->priority = tid; skb->dev = sdata->dev; /* * The other path calling ieee80211_xmit is from the tasklet, * and while we can handle concurrent transmissions locking * requirements are that we do not come into tx with bhs on. */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; ieee80211_xmit(sdata, NULL, skb); local_bh_enable(); } int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct sk_buff *skb; struct ethhdr *ehdr; u32 ctrl_flags = 0; u32 flags = 0; /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE * or Pre-Authentication */ if (proto != sdata->control_port_protocol && proto != cpu_to_be16(ETH_P_PREAUTH)) return -EINVAL; if (proto == sdata->control_port_protocol) ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO | IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP; if (unencrypted) flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (cookie) ctrl_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(struct ethhdr) + len); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom + sizeof(struct ethhdr)); skb_put_data(skb, buf, len); ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr->h_dest, dest, ETH_ALEN); memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); ehdr->h_proto = proto; skb->dev = dev; skb->protocol = proto; skb_reset_network_header(skb); skb_reset_mac_header(skb); if (local->hw.queues < IEEE80211_NUM_ACS) goto start_xmit; /* update QoS header to prioritize control port frames if possible, * priorization also happens for control port frames send over * AF_PACKET */ rcu_read_lock(); if (ieee80211_lookup_ra_sta(sdata, skb, &sta) == 0 && !IS_ERR(sta)) { u16 queue = __ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); skb_get_hash(skb); } rcu_read_unlock(); start_xmit: /* mutex lock is only needed for incrementing the cookie counter */ mutex_lock(&local->mtx); local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags, cookie); local_bh_enable(); mutex_unlock(&local->mtx); return 0; } int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; skb = dev_alloc_skb(local->hw.extra_tx_headroom + len + 30 + /* header size */ 18); /* 11s header size */ if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); skb_put_data(skb, buf, len); skb->dev = dev; skb->protocol = htons(ETH_P_802_3); skb_reset_network_header(skb); skb_reset_mac_header(skb); local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, 0, IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP, NULL); local_bh_enable(); return 0; } |
429 147 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FIB_LOOKUP_H #define _FIB_LOOKUP_H #include <linux/types.h> #include <linux/list.h> #include <net/ip_fib.h> #include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; struct fib_info *fa_info; u8 fa_tos; u8 fa_type; u8 fa_state; u8 fa_slen; u32 tb_id; s16 fa_default; u8 offload; u8 trap; u8 offload_failed; struct rcu_head rcu; }; #define FA_S_ACCESSED 0x01 /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) fa->fa_state |= FA_S_ACCESSED; } /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); size_t fib_nlmsg_size(struct fib_info *fi); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; res->nhc = fib_info_nhc(fi, 0); } struct fib_prop { int error; u8 scope; }; extern const struct fib_prop fib_props[RTN_MAX + 1]; #endif /* _FIB_LOOKUP_H */ |
492 47 556 558 559 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_BL_H #define _LINUX_LIST_BL_H #include <linux/list.h> #include <linux/bit_spinlock.h> /* * Special version of lists, where head of the list has a lock in the lowest * bit. This is useful for scalable hash tables without increasing memory * footprint overhead. * * For modification operations, the 0 bit of hlist_bl_head->first * pointer must be set. * * With some small modifications, this can easily be adapted to store several * arbitrary bits (not just a single lock bit), if the need arises to store * some fast and compact auxiliary data. */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL #endif #ifdef CONFIG_DEBUG_LIST #define LIST_BL_BUG_ON(x) BUG_ON(x) #else #define LIST_BL_BUG_ON(x) #endif struct hlist_bl_head { struct hlist_bl_node *first; }; struct hlist_bl_node { struct hlist_bl_node *next, **pprev; }; #define INIT_HLIST_BL_HEAD(ptr) \ ((ptr)->first = NULL) static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) { h->next = NULL; h->pprev = NULL; } #define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) { return !h->pprev; } static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)h->first & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; hlist_bl_set_first(h, n); } static inline void hlist_bl_add_before(struct hlist_bl_node *n, struct hlist_bl_node *next) { struct hlist_bl_node **pprev = next->pprev; n->pprev = pprev; n->next = next; next->pprev = &n->next; /* pprev may be `first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); } static inline void hlist_bl_add_behind(struct hlist_bl_node *n, struct hlist_bl_node *prev) { n->next = prev->next; n->pprev = &prev->next; prev->next = n; if (n->next) n->next->pprev = &n->next; } static inline void __hlist_bl_del(struct hlist_bl_node *n) { struct hlist_bl_node *next = n->next; struct hlist_bl_node **pprev = n->pprev; LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); /* pprev may be `first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((unsigned long)next | ((unsigned long)*pprev & LIST_BL_LOCKMASK))); if (next) next->pprev = pprev; } static inline void hlist_bl_del(struct hlist_bl_node *n) { __hlist_bl_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_bl_del_init(struct hlist_bl_node *n) { if (!hlist_bl_unhashed(n)) { __hlist_bl_del(n); INIT_HLIST_BL_NODE(n); } } static inline void hlist_bl_lock(struct hlist_bl_head *b) { bit_spin_lock(0, (unsigned long *)b); } static inline void hlist_bl_unlock(struct hlist_bl_head *b) { __bit_spin_unlock(0, (unsigned long *)b); } static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) { return bit_spin_is_locked(0, (unsigned long *)b); } /** * hlist_bl_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_bl_for_each_entry(tpos, pos, head, member) \ for (pos = hlist_bl_first(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member) \ for (pos = hlist_bl_first(head); \ pos && ({ n = pos->next; 1; }) && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = n) #endif |
7 7 7 3 1 2 8 1 7 7 7 7 7 7 15 15 5 10 5 8 8 8 1 1 3 3 3 14 1 1 12 7 5 5 5 5 4 5 1 1 7 7 7 7 7 7 7 7 15 15 7 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/uaccess.h> #include <linux/debugfs.h> #include <linux/caif/caif_socket.h> #include <linux/pkt_sched.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_CAIF); /* * CAIF state is re-using the TCP socket states. * caif_states stored in sk_state reflect the state as reported by * the CAIF stack, while sk_socket->state is the state of the socket. */ enum caif_states { CAIF_CONNECTED = TCP_ESTABLISHED, CAIF_CONNECTING = TCP_SYN_SENT, CAIF_DISCONNECTED = TCP_CLOSE }; #define TX_FLOW_ON_BIT 1 #define RX_FLOW_ON_BIT 2 struct caifsock { struct sock sk; /* must be first member */ struct cflayer layer; u32 flow_state; struct caif_connect_request conn_req; struct mutex readlock; struct dentry *debugfs_socket_dir; int headroom, tailroom, maxframe; }; static int rx_flow_is_on(struct caifsock *cf_sk) { return test_bit(RX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static int tx_flow_is_on(struct caifsock *cf_sk) { return test_bit(TX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void set_rx_flow_off(struct caifsock *cf_sk) { clear_bit(RX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void set_rx_flow_on(struct caifsock *cf_sk) { set_bit(RX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void set_tx_flow_off(struct caifsock *cf_sk) { clear_bit(TX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void set_tx_flow_on(struct caifsock *cf_sk) { set_bit(TX_FLOW_ON_BIT, (void *) &cf_sk->flow_state); } static void caif_read_lock(struct sock *sk) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); mutex_lock(&cf_sk->readlock); } static void caif_read_unlock(struct sock *sk) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); mutex_unlock(&cf_sk->readlock); } static int sk_rcvbuf_lowwater(struct caifsock *cf_sk) { /* A quarter of full buffer is used a low water mark */ return cf_sk->sk.sk_rcvbuf / 4; } static void caif_flow_ctrl(struct sock *sk, int mode) { struct caifsock *cf_sk; cf_sk = container_of(sk, struct caifsock, sk); if (cf_sk->layer.dn && cf_sk->layer.dn->modemcmd) cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode); } /* * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are * not dropped, but CAIF is sending flow off instead. */ static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); bool queued = false; if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { net_dbg_ratelimited("sending flow OFF (queue len = %d %d)\n", atomic_read(&cf_sk->sk.sk_rmem_alloc), sk_rcvbuf_lowwater(cf_sk)); set_rx_flow_off(cf_sk); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } err = sk_filter(sk, skb); if (err) goto out; if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } skb->dev = NULL; skb_set_owner_r(skb, sk); spin_lock_irqsave(&list->lock, flags); queued = !sock_flag(sk, SOCK_DEAD); if (queued) __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); out: if (queued) sk->sk_data_ready(sk); else kfree_skb(skb); } /* Packet Receive Callback function called from CAIF Stack */ static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct caifsock *cf_sk; struct sk_buff *skb; cf_sk = container_of(layr, struct caifsock, layer); skb = cfpkt_tonative(pkt); if (unlikely(cf_sk->sk.sk_state != CAIF_CONNECTED)) { kfree_skb(skb); return 0; } caif_queue_rcv_skb(&cf_sk->sk, skb); return 0; } static void cfsk_hold(struct cflayer *layr) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); sock_hold(&cf_sk->sk); } static void cfsk_put(struct cflayer *layr) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); sock_put(&cf_sk->sk); } /* Packet Control Callback function called from CAIF */ static void caif_ctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); switch (flow) { case CAIF_CTRLCMD_FLOW_ON_IND: /* OK from modem to start sending again */ set_tx_flow_on(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_FLOW_OFF_IND: /* Modem asks us to shut up */ set_tx_flow_off(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_INIT_RSP: /* We're now connected */ caif_client_register_refcnt(&cf_sk->layer, cfsk_hold, cfsk_put); cf_sk->sk.sk_state = CAIF_CONNECTED; set_tx_flow_on(cf_sk); cf_sk->sk.sk_shutdown = 0; cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_DEINIT_RSP: /* We're now disconnected */ cf_sk->sk.sk_state = CAIF_DISCONNECTED; cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_INIT_FAIL_RSP: /* Connect request failed */ cf_sk->sk.sk_err = ECONNREFUSED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; /* * Socket "standards" seems to require POLLOUT to * be set at connect failure. */ set_tx_flow_on(cf_sk); cf_sk->sk.sk_state_change(&cf_sk->sk); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: /* Modem has closed this connection, or device is down. */ cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; cf_sk->sk.sk_err = ECONNRESET; set_rx_flow_on(cf_sk); sk_error_report(&cf_sk->sk); break; default: pr_debug("Unexpected flow command %d\n", flow); } } static void caif_check_flow_release(struct sock *sk) { struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); if (rx_flow_is_on(cf_sk)) return; if (atomic_read(&sk->sk_rmem_alloc) <= sk_rcvbuf_lowwater(cf_sk)) { set_rx_flow_on(cf_sk); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ); } } /* * Copied from unix_dgram_recvmsg, but removed credit checks, * changed locking, address handling and added MSG_TRUNC. */ static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int ret; int copylen; ret = -EOPNOTSUPP; if (flags & MSG_OOB) goto read_error; skb = skb_recv_datagram(sk, flags, 0 , &ret); if (!skb) goto read_error; copylen = skb->len; if (len < copylen) { m->msg_flags |= MSG_TRUNC; copylen = len; } ret = skb_copy_datagram_msg(skb, 0, m, copylen); if (ret) goto out_free; ret = (flags & MSG_TRUNC) ? skb->len : copylen; out_free: skb_free_datagram(sk, skb); caif_check_flow_release(sk); return ret; read_error: return ret; } /* Copied from unix_stream_wait_data, identical except for lock call. */ static long caif_stream_data_wait(struct sock *sk, long timeo) { DEFINE_WAIT(wait); lock_sock(sk); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!skb_queue_empty(&sk->sk_receive_queue) || sk->sk_err || sk->sk_state != CAIF_CONNECTED || sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) break; sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); release_sock(sk); return timeo; } /* * Copied from unix_stream_recvmsg, but removed credit checks, * changed locking calls, changed address handling. */ static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int copied = 0; int target; int err = 0; long timeo; err = -EOPNOTSUPP; if (flags&MSG_OOB) goto out; /* * Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ err = -EAGAIN; if (sk->sk_state == CAIF_CONNECTING) goto out; caif_read_lock(sk); target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); do { int chunk; struct sk_buff *skb; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; goto unlock; } skb = skb_dequeue(&sk->sk_receive_queue); caif_check_flow_release(sk); if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. */ err = sock_error(sk); if (err) goto unlock; err = -ECONNRESET; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; err = -EPIPE; if (sk->sk_state != CAIF_CONNECTED) goto unlock; if (sock_flag(sk, SOCK_DEAD)) goto unlock; release_sock(sk); err = -EAGAIN; if (!timeo) break; caif_read_unlock(sk); timeo = caif_stream_data_wait(sk, timeo); if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } caif_read_lock(sk); continue; unlock: release_sock(sk); break; } release_sock(sk); chunk = min_t(unsigned int, skb->len, size); if (memcpy_to_msg(msg, skb->data, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { skb_pull(skb, chunk); /* put the skb back if we didn't use it up. */ if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); break; } kfree_skb(skb); } else { /* * It is questionable, see note in unix_dgram_recvmsg. */ /* put message back and return */ skb_queue_head(&sk->sk_receive_queue, skb); break; } } while (size); caif_read_unlock(sk); out: return copied ? : err; } /* * Copied from sock.c:sock_wait_for_wmem, but change to wait for * CAIF flow-on and sock_writable. */ static long caif_wait_for_flow_on(struct caifsock *cf_sk, int wait_writeable, long timeo, int *err) { struct sock *sk = &cf_sk->sk; DEFINE_WAIT(wait); for (;;) { *err = 0; if (tx_flow_is_on(cf_sk) && (!wait_writeable || sock_writeable(&cf_sk->sk))) break; *err = -ETIMEDOUT; if (!timeo) break; *err = -ERESTARTSYS; if (signal_pending(current)) break; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); *err = -ECONNRESET; if (sk->sk_shutdown & SHUTDOWN_MASK) break; *err = -sk->sk_err; if (sk->sk_err) break; *err = -EPIPE; if (cf_sk->sk.sk_state != CAIF_CONNECTED) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Transmit a SKB. The device may temporarily request re-transmission * by returning EAGAIN. */ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, int noblock, long timeo) { struct cfpkt *pkt; pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); memset(skb->cb, 0, sizeof(struct caif_payload_info)); cfpkt_set_prio(pkt, cf_sk->sk.sk_priority); if (cf_sk->layer.dn == NULL) { kfree_skb(skb); return -EINVAL; } return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); } /* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int buffer_size; int ret = 0; struct sk_buff *skb = NULL; int noblock; long timeo; caif_assert(cf_sk); ret = sock_error(sk); if (ret) goto err; ret = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto err; ret = -EOPNOTSUPP; if (msg->msg_namelen) goto err; ret = -EINVAL; if (unlikely(msg->msg_iter.nr_segs == 0) || unlikely(msg->msg_iter.iov->iov_base == NULL)) goto err; noblock = msg->msg_flags & MSG_DONTWAIT; timeo = sock_sndtimeo(sk, noblock); timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk), 1, timeo, &ret); if (ret) goto err; ret = -EPIPE; if (cf_sk->sk.sk_state != CAIF_CONNECTED || sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) goto err; /* Error if trying to write more than maximum frame size. */ ret = -EMSGSIZE; if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM) goto err; buffer_size = len + cf_sk->headroom + cf_sk->tailroom; ret = -ENOMEM; skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret); if (!skb || skb_tailroom(skb) < buffer_size) goto err; skb_reserve(skb, cf_sk->headroom); ret = memcpy_from_msg(skb_put(skb, len), msg, len); if (ret) goto err; ret = transmit_skb(skb, cf_sk, noblock, timeo); if (ret < 0) /* skb is already freed */ return ret; return len; err: kfree_skb(skb); return ret; } /* * Copied from unix_stream_sendmsg and adapted to CAIF: * Changed removed permission handling and added waiting for flow on * and other minor adaptations. */ static int caif_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int err, size; struct sk_buff *skb; int sent = 0; long timeo; err = -EOPNOTSUPP; if (unlikely(msg->msg_flags&MSG_OOB)) goto out_err; if (unlikely(msg->msg_namelen)) goto out_err; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); timeo = caif_wait_for_flow_on(cf_sk, 1, timeo, &err); if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) goto pipe_err; while (sent < len) { size = len-sent; if (size > cf_sk->maxframe) size = cf_sk->maxframe; /* If size is more than half of sndbuf, chop up message */ if (size > ((sk->sk_sndbuf >> 1) - 64)) size = (sk->sk_sndbuf >> 1) - 64; if (size > SKB_MAX_ALLOC) size = SKB_MAX_ALLOC; skb = sock_alloc_send_skb(sk, size + cf_sk->headroom + cf_sk->tailroom, msg->msg_flags&MSG_DONTWAIT, &err); if (skb == NULL) goto out_err; skb_reserve(skb, cf_sk->headroom); /* * If you pass two values to the sock_alloc_send_skb * it tries to grab the large buffer with GFP_NOFS * (which can fail easily), and if it fails grab the * fallback size buffer which is under a page and will * succeed. [Alan] */ size = min_t(int, size, skb_tailroom(skb)); err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err) { kfree_skb(skb); goto out_err; } err = transmit_skb(skb, cf_sk, msg->msg_flags&MSG_DONTWAIT, timeo); if (err < 0) /* skb is already freed */ goto pipe_err; sent += size; } return sent; pipe_err: if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: return sent ? : err; } static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov, unsigned int ol) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); int linksel; if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED) return -ENOPROTOOPT; switch (opt) { case CAIFSO_LINK_SELECT: if (ol < sizeof(int)) return -EINVAL; if (lvl != SOL_CAIF) goto bad_sol; if (copy_from_sockptr(&linksel, ov, sizeof(int))) return -EINVAL; lock_sock(&(cf_sk->sk)); cf_sk->conn_req.link_selector = linksel; release_sock(&cf_sk->sk); return 0; case CAIFSO_REQ_PARAM: if (lvl != SOL_CAIF) goto bad_sol; if (cf_sk->sk.sk_protocol != CAIFPROTO_UTIL) return -ENOPROTOOPT; lock_sock(&(cf_sk->sk)); if (ol > sizeof(cf_sk->conn_req.param.data) || copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) { release_sock(&cf_sk->sk); return -EINVAL; } cf_sk->conn_req.param.size = ol; release_sock(&cf_sk->sk); return 0; default: return -ENOPROTOOPT; } return 0; bad_sol: return -ENOPROTOOPT; } /* * caif_connect() - Connect a CAIF Socket * Copied and modified af_irda.c:irda_connect(). * * Note : by consulting "errno", the user space caller may learn the cause * of the failure. Most of them are visible in the function, others may come * from subroutines called and are listed here : * o -EAFNOSUPPORT: bad socket family or type. * o -ESOCKTNOSUPPORT: bad socket type or protocol * o -EINVAL: bad socket address, or CAIF link type * o -ECONNREFUSED: remote end refused the connection. * o -EINPROGRESS: connect request sent but timed out (or non-blocking) * o -EISCONN: already connected. * o -ETIMEDOUT: Connection timed out (send timeout) * o -ENODEV: No link layer to send request * o -ECONNRESET: Received Shutdown indication or lost link layer * o -ENOMEM: Out of memory * * State Strategy: * o sk_state: holds the CAIF_* protocol state, it's updated by * caif_ctrl_cb. * o sock->state: holds the SS_* socket state and is updated by connect and * disconnect. */ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); long timeo; int err; int ifindex, headroom, tailroom; unsigned int mtu; struct net_device *dev; lock_sock(sk); err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) goto out; err = -EAFNOSUPPORT; if (uaddr->sa_family != AF_CAIF) goto out; switch (sock->state) { case SS_UNCONNECTED: /* Normal case, a fresh connect */ caif_assert(sk->sk_state == CAIF_DISCONNECTED); break; case SS_CONNECTING: switch (sk->sk_state) { case CAIF_CONNECTED: sock->state = SS_CONNECTED; err = -EISCONN; goto out; case CAIF_DISCONNECTED: /* Reconnect allowed */ break; case CAIF_CONNECTING: err = -EALREADY; if (flags & O_NONBLOCK) goto out; goto wait_connect; } break; case SS_CONNECTED: caif_assert(sk->sk_state == CAIF_CONNECTED || sk->sk_state == CAIF_DISCONNECTED); if (sk->sk_shutdown & SHUTDOWN_MASK) { /* Allow re-connect after SHUTDOWN_IND */ caif_disconnect_client(sock_net(sk), &cf_sk->layer); caif_free_client(&cf_sk->layer); break; } /* No reconnect on a seqpacket socket */ err = -EISCONN; goto out; case SS_DISCONNECTING: case SS_FREE: caif_assert(1); /*Should never happen */ break; } sk->sk_state = CAIF_DISCONNECTED; sock->state = SS_UNCONNECTED; sk_stream_kill_queues(&cf_sk->sk); err = -EINVAL; if (addr_len != sizeof(struct sockaddr_caif)) goto out; memcpy(&cf_sk->conn_req.sockaddr, uaddr, sizeof(struct sockaddr_caif)); /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = CAIF_CONNECTING; /* Check priority value comming from socket */ /* if priority value is out of range it will be ajusted */ if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX) cf_sk->conn_req.priority = CAIF_PRIO_MAX; else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN) cf_sk->conn_req.priority = CAIF_PRIO_MIN; else cf_sk->conn_req.priority = cf_sk->sk.sk_priority; /*ifindex = id of the interface.*/ cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if; cf_sk->layer.receive = caif_sktrecv_cb; err = caif_connect_client(sock_net(sk), &cf_sk->conn_req, &cf_sk->layer, &ifindex, &headroom, &tailroom); if (err < 0) { cf_sk->sk.sk_socket->state = SS_UNCONNECTED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; goto out; } err = -ENODEV; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { rcu_read_unlock(); goto out; } cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom); mtu = dev->mtu; rcu_read_unlock(); cf_sk->tailroom = tailroom; cf_sk->maxframe = mtu - (headroom + tailroom); if (cf_sk->maxframe < 1) { pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu); err = -ENODEV; goto out; } err = -EINPROGRESS; wait_connect: if (sk->sk_state != CAIF_CONNECTED && (flags & O_NONBLOCK)) goto out; timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); release_sock(sk); err = -ERESTARTSYS; timeo = wait_event_interruptible_timeout(*sk_sleep(sk), sk->sk_state != CAIF_CONNECTING, timeo); lock_sock(sk); if (timeo < 0) goto out; /* -ERESTARTSYS */ err = -ETIMEDOUT; if (timeo == 0 && sk->sk_state != CAIF_CONNECTED) goto out; if (sk->sk_state != CAIF_CONNECTED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); if (!err) err = -ECONNREFUSED; goto out; } sock->state = SS_CONNECTED; err = 0; out: release_sock(sk); return err; } /* * caif_release() - Disconnect a CAIF Socket * Copied and modified af_irda.c:irda_release(). */ static int caif_release(struct socket *sock) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); if (!sk) return 0; set_tx_flow_off(cf_sk); /* * Ensure that packets are not queued after this point in time. * caif_queue_rcv_skb checks SOCK_DEAD holding the queue lock, * this ensures no packets when sock is dead. */ spin_lock_bh(&sk->sk_receive_queue.lock); sock_set_flag(sk, SOCK_DEAD); spin_unlock_bh(&sk->sk_receive_queue.lock); sock->sk = NULL; WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir)); debugfs_remove_recursive(cf_sk->debugfs_socket_dir); lock_sock(&(cf_sk->sk)); sk->sk_state = CAIF_DISCONNECTED; sk->sk_shutdown = SHUTDOWN_MASK; caif_disconnect_client(sock_net(sk), &cf_sk->layer); cf_sk->sk.sk_socket->state = SS_DISCONNECTING; wake_up_interruptible_poll(sk_sleep(sk), EPOLLERR|EPOLLHUP); sock_orphan(sk); sk_stream_kill_queues(&cf_sk->sk); release_sock(sk); sock_put(sk); return 0; } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ static __poll_t caif_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ if (sk->sk_err) mask |= EPOLLERR; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) mask |= EPOLLIN | EPOLLRDNORM; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (sock_writeable(sk) && tx_flow_is_on(cf_sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static const struct proto_ops caif_seqpacket_ops = { .family = PF_CAIF, .owner = THIS_MODULE, .release = caif_release, .bind = sock_no_bind, .connect = caif_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, .sendmsg = caif_seqpkt_sendmsg, .recvmsg = caif_seqpkt_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; static const struct proto_ops caif_stream_ops = { .family = PF_CAIF, .owner = THIS_MODULE, .release = caif_release, .bind = sock_no_bind, .connect = caif_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = setsockopt, .sendmsg = caif_stream_sendmsg, .recvmsg = caif_stream_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; /* This function is called when a socket is finally destroyed. */ static void caif_sock_destructor(struct sock *sk) { struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); caif_assert(!refcount_read(&sk->sk_wmem_alloc)); caif_assert(sk_unhashed(sk)); caif_assert(!sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_debug("Attempt to release alive CAIF socket: %p\n", sk); return; } sk_stream_kill_queues(&cf_sk->sk); WARN_ON(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } static int caif_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk = NULL; struct caifsock *cf_sk = NULL; static struct proto prot = {.name = "PF_CAIF", .owner = THIS_MODULE, .obj_size = sizeof(struct caifsock), .useroffset = offsetof(struct caifsock, conn_req.param), .usersize = sizeof_field(struct caifsock, conn_req.param) }; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN)) return -EPERM; /* * The sock->type specifies the socket type to use. * The CAIF socket is a packet stream in the sense * that it is packet based. CAIF trusts the reliability * of the link, no resending is implemented. */ if (sock->type == SOCK_SEQPACKET) sock->ops = &caif_seqpacket_ops; else if (sock->type == SOCK_STREAM) sock->ops = &caif_stream_ops; else return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= CAIFPROTO_MAX) return -EPROTONOSUPPORT; /* * Set the socket state to unconnected. The socket state * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. */ sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; cf_sk = container_of(sk, struct caifsock, sk); /* Store the protocol */ sk->sk_protocol = (unsigned char) protocol; /* Initialize default priority for well-known cases */ switch (protocol) { case CAIFPROTO_AT: sk->sk_priority = TC_PRIO_CONTROL; break; case CAIFPROTO_RFM: sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; break; default: sk->sk_priority = TC_PRIO_BESTEFFORT; } /* * Lock in order to try to stop someone from opening the socket * too early. */ lock_sock(&(cf_sk->sk)); /* Initialize the nozero default sock structure data. */ sock_init_data(sock, sk); sk->sk_destruct = caif_sock_destructor; mutex_init(&cf_sk->readlock); /* single task reading lock */ cf_sk->layer.ctrlcmd = caif_ctrl_cb; cf_sk->sk.sk_socket->state = SS_UNCONNECTED; cf_sk->sk.sk_state = CAIF_DISCONNECTED; set_tx_flow_off(cf_sk); set_rx_flow_on(cf_sk); /* Set default options on configuration */ cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY; cf_sk->conn_req.protocol = protocol; release_sock(&cf_sk->sk); return 0; } static const struct net_proto_family caif_family_ops = { .family = PF_CAIF, .create = caif_create, .owner = THIS_MODULE, }; static int __init caif_sktinit_module(void) { return sock_register(&caif_family_ops); } static void __exit caif_sktexit_module(void) { sock_unregister(PF_CAIF); } module_init(caif_sktinit_module); module_exit(caif_sktexit_module); |
12 278 278 278 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_page_template, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = page->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_page_template, writeback_dirty_page, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping) ); DEFINE_EVENT(writeback_page_template, wait_on_page_writeback, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct page *page, struct bdi_writeback *wb), TP_ARGS(page, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = page_mapping(page); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, bdi_setpoint) __field(unsigned long, bdi_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; __entry->dirty = dirty; __entry->bdi_setpoint = __entry->setpoint * bdi_thresh / (thresh + 1); __entry->bdi_dirty = bdi_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->bdi_setpoint, __entry->bdi_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed), TP_STRUCT__entry( __field( unsigned int, usec_timeout ) __field( unsigned int, usec_delayed ) ), TP_fast_assign( __entry->usec_timeout = usec_timeout; __entry->usec_delayed = usec_delayed; ), TP_printk("usec_timeout=%u usec_delayed=%u", __entry->usec_timeout, __entry->usec_delayed) ); DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed) ); DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 2003 International Business Machines, Corp. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/types.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/export.h> #include <net/sctp/sctp.h> #include <net/ip.h> /* for snmp_fold_field */ static const struct snmp_mib sctp_snmp_list[] = { SNMP_MIB_ITEM("SctpCurrEstab", SCTP_MIB_CURRESTAB), SNMP_MIB_ITEM("SctpActiveEstabs", SCTP_MIB_ACTIVEESTABS), SNMP_MIB_ITEM("SctpPassiveEstabs", SCTP_MIB_PASSIVEESTABS), SNMP_MIB_ITEM("SctpAborteds", SCTP_MIB_ABORTEDS), SNMP_MIB_ITEM("SctpShutdowns", SCTP_MIB_SHUTDOWNS), SNMP_MIB_ITEM("SctpOutOfBlues", SCTP_MIB_OUTOFBLUES), SNMP_MIB_ITEM("SctpChecksumErrors", SCTP_MIB_CHECKSUMERRORS), SNMP_MIB_ITEM("SctpOutCtrlChunks", SCTP_MIB_OUTCTRLCHUNKS), SNMP_MIB_ITEM("SctpOutOrderChunks", SCTP_MIB_OUTORDERCHUNKS), SNMP_MIB_ITEM("SctpOutUnorderChunks", SCTP_MIB_OUTUNORDERCHUNKS), SNMP_MIB_ITEM("SctpInCtrlChunks", SCTP_MIB_INCTRLCHUNKS), SNMP_MIB_ITEM("SctpInOrderChunks", SCTP_MIB_INORDERCHUNKS), SNMP_MIB_ITEM("SctpInUnorderChunks", SCTP_MIB_INUNORDERCHUNKS), SNMP_MIB_ITEM("SctpFragUsrMsgs", SCTP_MIB_FRAGUSRMSGS), SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS), SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS), SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS), SNMP_MIB_ITEM("SctpT1InitExpireds", SCTP_MIB_T1_INIT_EXPIREDS), SNMP_MIB_ITEM("SctpT1CookieExpireds", SCTP_MIB_T1_COOKIE_EXPIREDS), SNMP_MIB_ITEM("SctpT2ShutdownExpireds", SCTP_MIB_T2_SHUTDOWN_EXPIREDS), SNMP_MIB_ITEM("SctpT3RtxExpireds", SCTP_MIB_T3_RTX_EXPIREDS), SNMP_MIB_ITEM("SctpT4RtoExpireds", SCTP_MIB_T4_RTO_EXPIREDS), SNMP_MIB_ITEM("SctpT5ShutdownGuardExpireds", SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS), SNMP_MIB_ITEM("SctpDelaySackExpireds", SCTP_MIB_DELAY_SACK_EXPIREDS), SNMP_MIB_ITEM("SctpAutocloseExpireds", SCTP_MIB_AUTOCLOSE_EXPIREDS), SNMP_MIB_ITEM("SctpT3Retransmits", SCTP_MIB_T3_RETRANSMITS), SNMP_MIB_ITEM("SctpPmtudRetransmits", SCTP_MIB_PMTUD_RETRANSMITS), SNMP_MIB_ITEM("SctpFastRetransmits", SCTP_MIB_FAST_RETRANSMITS), SNMP_MIB_ITEM("SctpInPktSoftirq", SCTP_MIB_IN_PKT_SOFTIRQ), SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG), SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS), SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS), SNMP_MIB_SENTINEL }; /* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) { unsigned long buff[SCTP_MIB_MAX]; struct net *net = seq->private; int i; memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX); snmp_get_cpu_field_batch(buff, sctp_snmp_list, net->sctp.sctp_statistics); for (i = 0; sctp_snmp_list[i].name; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, buff[i]); return 0; } /* Dump local addresses of an association/endpoint. */ static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) { struct sctp_association *asoc; struct sctp_sockaddr_entry *laddr; struct sctp_transport *peer; union sctp_addr *addr, *primary = NULL; struct sctp_af *af; if (epb->type == SCTP_EP_TYPE_ASSOCIATION) { asoc = sctp_assoc(epb); peer = asoc->peer.primary_path; if (unlikely(peer == NULL)) { WARN(1, "Association %p with NULL primary path!\n", asoc); return; } primary = &peer->saddr; } rcu_read_lock(); list_for_each_entry_rcu(laddr, &epb->bind_addr.address_list, list) { if (!laddr->valid) continue; addr = &laddr->a; af = sctp_get_af_specific(addr->sa.sa_family); if (primary && af->cmp_addr(addr, primary)) { seq_printf(seq, "*"); } af->seq_dump_addr(seq, addr); } rcu_read_unlock(); } /* Dump remote addresses of an association. */ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_association *assoc) { struct sctp_transport *transport; union sctp_addr *addr, *primary; struct sctp_af *af; primary = &assoc->peer.primary_addr; list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list, transports) { addr = &transport->ipaddr; af = sctp_get_af_specific(addr->sa.sa_family); if (af->cmp_addr(addr, primary)) { seq_printf(seq, "*"); } af->seq_dump_addr(seq, addr); } } static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos >= sctp_ep_hashsize) return NULL; if (*pos < 0) *pos = 0; if (*pos == 0) seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT UID INODE LADDRS\n"); return (void *)pos; } static void sctp_eps_seq_stop(struct seq_file *seq, void *v) { } static void *sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos) { if (++*pos >= sctp_ep_hashsize) return NULL; return pos; } /* Display sctp endpoints (/proc/net/sctp/eps). */ static int sctp_eps_seq_show(struct seq_file *seq, void *v) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; struct sock *sk; int hash = *(loff_t *)v; if (hash >= sctp_ep_hashsize) return -ENOMEM; head = &sctp_ep_hashtable[hash]; read_lock_bh(&head->lock); sctp_for_each_hentry(ep, &head->chain) { sk = ep->base.sk; if (!net_eq(sock_net(sk), seq_file_net(seq))) continue; seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk, sctp_sk(sk)->type, sk->sk_state, hash, ep->base.bind_addr.port, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk)); sctp_seq_dump_local_addrs(seq, &ep->base); seq_printf(seq, "\n"); } read_unlock_bh(&head->lock); return 0; } static const struct seq_operations sctp_eps_ops = { .start = sctp_eps_seq_start, .next = sctp_eps_seq_next, .stop = sctp_eps_seq_stop, .show = sctp_eps_seq_show, }; struct sctp_ht_iter { struct seq_net_private p; struct rhashtable_iter hti; }; static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; sctp_transport_walk_start(&iter->hti); return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos); } static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; if (v && v != SEQ_START_TOKEN) { struct sctp_transport *transport = v; sctp_transport_put(transport); } sctp_transport_walk_stop(&iter->hti); } static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; if (v && v != SEQ_START_TOKEN) { struct sctp_transport *transport = v; sctp_transport_put(transport); } ++*pos; return sctp_transport_get_next(seq_file_net(seq), &iter->hti); } /* Display sctp associations (/proc/net/sctp/assocs). */ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) { struct sctp_transport *transport; struct sctp_association *assoc; struct sctp_ep_common *epb; struct sock *sk; if (v == SEQ_START_TOKEN) { seq_printf(seq, " ASSOC SOCK STY SST ST HBKT " "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT " "RPORT LADDRS <-> RADDRS " "HBINT INS OUTS MAXRT T1X T2X RTXC " "wmema wmemq sndbuf rcvbuf\n"); return 0; } transport = (struct sctp_transport *)v; assoc = transport->asoc; epb = &assoc->base; sk = epb->sk; seq_printf(seq, "%8pK %8pK %-3d %-3d %-2d %-4d " "%4d %8d %8d %7u %5lu %-5d %5d ", assoc, sk, sctp_sk(sk)->type, sk->sk_state, assoc->state, 0, assoc->assoc_id, assoc->sndbuf_used, atomic_read(&assoc->rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), epb->bind_addr.port, assoc->peer.port); seq_printf(seq, " "); sctp_seq_dump_local_addrs(seq, epb); seq_printf(seq, "<-> "); sctp_seq_dump_remote_addrs(seq, assoc); seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " "%8d %8d %8d %8d", assoc->hbinterval, assoc->stream.incnt, assoc->stream.outcnt, assoc->max_retrans, assoc->init_retries, assoc->shutdown_retries, assoc->rtx_data_chunks, refcount_read(&sk->sk_wmem_alloc), READ_ONCE(sk->sk_wmem_queued), sk->sk_sndbuf, sk->sk_rcvbuf); seq_printf(seq, "\n"); return 0; } static const struct seq_operations sctp_assoc_ops = { .start = sctp_transport_seq_start, .next = sctp_transport_seq_next, .stop = sctp_transport_seq_stop, .show = sctp_assocs_seq_show, }; static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; struct sctp_transport *transport, *tsp; if (v == SEQ_START_TOKEN) { seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " "REM_ADDR_RTX START STATE\n"); return 0; } transport = (struct sctp_transport *)v; assoc = transport->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, transports) { /* * The remote address (ADDR) */ tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr); seq_printf(seq, " "); /* * The association ID (ASSOC_ID) */ seq_printf(seq, "%d ", tsp->asoc->assoc_id); /* * If the Heartbeat is active (HB_ACT) * Note: 1 = Active, 0 = Inactive */ seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer)); /* * Retransmit time out (RTO) */ seq_printf(seq, "%lu ", tsp->rto); /* * Maximum path retransmit count (PATH_MAX_RTX) */ seq_printf(seq, "%d ", tsp->pathmaxrxt); /* * remote address retransmit count (REM_ADDR_RTX) * Note: We don't have a way to tally this at the moment * so lets just leave it as zero for the moment */ seq_puts(seq, "0 "); /* * remote address start time (START). This is also not * currently implemented, but we can record it with a * jiffies marker in a subsequent patch */ seq_puts(seq, "0 "); /* * The current state of this destination. I.e. * SCTP_ACTIVE, SCTP_INACTIVE, ... */ seq_printf(seq, "%d", tsp->state); seq_printf(seq, "\n"); } return 0; } static const struct seq_operations sctp_remaddr_ops = { .start = sctp_transport_seq_start, .next = sctp_transport_seq_next, .stop = sctp_transport_seq_stop, .show = sctp_remaddr_seq_show, }; /* Set up the proc fs entry for the SCTP protocol. */ int __net_init sctp_proc_init(struct net *net) { net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); if (!net->sctp.proc_net_sctp) return -ENOMEM; if (!proc_create_net_single("snmp", 0444, net->sctp.proc_net_sctp, sctp_snmp_seq_show, NULL)) goto cleanup; if (!proc_create_net("eps", 0444, net->sctp.proc_net_sctp, &sctp_eps_ops, sizeof(struct seq_net_private))) goto cleanup; if (!proc_create_net("assocs", 0444, net->sctp.proc_net_sctp, &sctp_assoc_ops, sizeof(struct sctp_ht_iter))) goto cleanup; if (!proc_create_net("remaddr", 0444, net->sctp.proc_net_sctp, &sctp_remaddr_ops, sizeof(struct sctp_ht_iter))) goto cleanup; return 0; cleanup: remove_proc_subtree("sctp", net->proc_net); net->sctp.proc_net_sctp = NULL; return -ENOMEM; } |
48 48 48 48 48 48 47 48 48 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 | // SPDX-License-Identifier: GPL-2.0-or-later /* Maintain an RxRPC server socket to do AFS communications through * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "internal.h" #include "afs_cm.h" #include "protocol_yfs.h" struct workqueue_struct *afs_async_calls; static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); static int afs_deliver_cm_op_id(struct afs_call *); /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", .deliver = afs_deliver_cm_op_id, }; /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT */ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; int ret; _enter(""); ret = sock_create_kern(net->net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &socket); if (ret < 0) goto error_1; socket->sk->sk_allocation = GFP_NOFS; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; srx.srx_service = CM_SERVICE; srx.transport_type = SOCK_DGRAM; srx.transport_len = sizeof(srx.transport.sin6); srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); ret = rxrpc_sock_set_min_security_level(socket->sk, RXRPC_SECURITY_ENCRYPT); if (ret < 0) goto error_2; ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); } if (ret < 0) goto error_2; srx.srx_service = YFS_CM_SERVICE; ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret < 0) goto error_2; /* Ideally, we'd turn on service upgrade here, but we can't because * OpenAFS is buggy and leaks the userStatus field from packet to * packet and between FS packets and CB packets - so if we try to do an * upgrade on an FS packet, OpenAFS will leak that into the CB packet * it sends back to us. */ rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, afs_rx_discard_new_call); ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; net->socket = socket; afs_charge_preallocation(&net->charge_preallocation_work); _leave(" = 0"); return 0; error_2: sock_release(socket); error_1: _leave(" = %d", ret); return ret; } /* * close the RxRPC socket AFS was using */ void afs_close_socket(struct afs_net *net) { _enter(""); kernel_listen(net->socket, 0); flush_workqueue(afs_async_calls); if (net->spare_incoming_call) { afs_put_call(net->spare_incoming_call); net->spare_incoming_call = NULL; } _debug("outstanding %u", atomic_read(&net->nr_outstanding_calls)); wait_var_event(&net->nr_outstanding_calls, !atomic_read(&net->nr_outstanding_calls)); _debug("no outstanding calls"); kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); sock_release(net->socket); _debug("dework"); _leave(""); } /* * Allocate a call. */ static struct afs_call *afs_alloc_call(struct afs_net *net, const struct afs_call_type *type, gfp_t gfp) { struct afs_call *call; int o; call = kzalloc(sizeof(*call), gfp); if (!call) return NULL; call->type = type; call->net = net; call->debug_id = atomic_inc_return(&rxrpc_debug_id); refcount_set(&call->ref, 1); INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); call->iter = &call->def_iter; o = atomic_inc_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_alloc, 1, o, __builtin_return_address(0)); return call; } /* * Dispose of a reference on a call. */ void afs_put_call(struct afs_call *call) { struct afs_net *net = call->net; bool zero; int r, o; zero = __refcount_dec_and_test(&call->ref, &r); o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_put, r - 1, o, __builtin_return_address(0)); if (zero) { ASSERT(!work_pending(&call->async_work)); ASSERT(call->type->name != NULL); if (call->rxcall) { rxrpc_kernel_end_call(net->socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) call->type->destructor(call); afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); afs_put_addrlist(call->alist); kfree(call->request); trace_afs_call(call, afs_call_trace_free, 0, o, __builtin_return_address(0)); kfree(call); o = atomic_dec_return(&net->nr_outstanding_calls); if (o == 0) wake_up_var(&net->nr_outstanding_calls); } } static struct afs_call *afs_get_call(struct afs_call *call, enum afs_call_trace why) { int r; __refcount_inc(&call->ref, &r); trace_afs_call(call, why, r + 1, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); return call; } /* * Queue the call for actual work. */ static void afs_queue_call_work(struct afs_call *call) { if (call->type->work) { INIT_WORK(&call->work, call->type->work); afs_get_call(call, afs_call_trace_work); if (!queue_work(afs_wq, &call->work)) afs_put_call(call); } } /* * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(struct afs_net *net, const struct afs_call_type *type, size_t request_size, size_t reply_max) { struct afs_call *call; call = afs_alloc_call(net, type, GFP_NOFS); if (!call) goto nomem_call; if (request_size) { call->request_size = request_size; call->request = kmalloc(request_size, GFP_NOFS); if (!call->request) goto nomem_free; } if (reply_max) { call->reply_max = reply_max; call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; } afs_extract_to_buf(call, call->reply_max); call->operation_ID = type->op; init_waitqueue_head(&call->waitq); return call; nomem_free: afs_put_call(call); nomem_call: return NULL; } /* * clean up a call with flat buffer */ void afs_flat_call_destructor(struct afs_call *call) { _enter(""); kfree(call->request); call->request = NULL; kfree(call->buffer); call->buffer = NULL; } /* * Advance the AFS call state when the RxRPC call ends the transmit phase. */ static void afs_notify_end_request_tx(struct sock *sock, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; afs_set_call_state(call, AFS_CALL_CL_REQUESTING, AFS_CALL_CL_AWAIT_REPLY); } /* * Initiate a call and synchronously queue up the parameters for dispatch. Any * error is stored into the call struct, which the caller must check for. */ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) { struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index]; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; size_t len; s64 tx_total_len; int ret; _enter(",{%pISp},", &srx->transport); ASSERT(call->type != NULL); ASSERT(call->type->name != NULL); _debug("____MAKE %p{%s,%x} [%d]____", call, call->type->name, key_serial(call->key), atomic_read(&call->net->nr_outstanding_calls)); call->addr_ix = ac->index; call->alist = afs_get_addrlist(ac->alist); /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data * after the initial fixed part. */ tx_total_len = call->request_size; if (call->write_iter) tx_total_len += iov_iter_count(call->write_iter); /* If the call is going to be asynchronous, we need an extra ref for * the call to hold itself so the caller need not hang on to its ref. */ if (call->async) { afs_get_call(call, afs_call_trace_get); call->drop_ref = true; } /* create a call */ rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, (unsigned long)call, tx_total_len, gfp, (call->async ? afs_wake_up_async_call : afs_wake_up_call_waiter), call->upgrade, (call->intr ? RXRPC_PREINTERRUPTIBLE : RXRPC_UNINTERRUPTIBLE), call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); call->error = ret; goto error_kill_call; } call->rxcall = rxcall; if (call->max_lifespan) rxrpc_kernel_set_max_life(call->net->socket, rxcall, call->max_lifespan); call->issue_time = ktime_get_real(); /* send the request */ iov[0].iov_base = call->request; iov[0].iov_len = call->request_size; msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0); ret = rxrpc_kernel_send_data(call->net->socket, rxcall, &msg, call->request_size, afs_notify_end_request_tx); if (ret < 0) goto error_do_abort; if (call->write_iter) { msg.msg_iter = *call->write_iter; msg.msg_flags &= ~MSG_MORE; trace_afs_send_data(call, &msg); ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, &msg, iov_iter_count(&msg.msg_iter), afs_notify_end_request_tx); *call->write_iter = msg.msg_iter; trace_afs_sent_data(call, &msg, ret); if (ret < 0) goto error_do_abort; } /* Note that at this point, we may have received the reply or an abort * - and an asynchronous call may already have completed. * * afs_wait_for_call_to_complete(call, ac) * must be called to synchronously clean up. */ return; error_do_abort: if (ret != -ECONNABORTED) { rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { len = 0; iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); ac->abort_code = call->abort_code; ac->responded = true; } call->error = ret; trace_afs_call_done(call); error_kill_call: if (call->type->done) call->type->done(call); /* We need to dispose of the extra ref we grabbed for an async call. * The call, however, might be queued on afs_async_calls and we need to * make sure we don't get any more notifications that might requeue it. */ if (call->rxcall) { rxrpc_kernel_end_call(call->net->socket, call->rxcall); call->rxcall = NULL; } if (call->async) { if (cancel_work_sync(&call->async_work)) afs_put_call(call); afs_set_call_complete(call, ret, 0); } ac->error = ret; call->state = AFS_CALL_COMPLETE; _leave(" = %d", ret); } /* * Log remote abort codes that indicate that we have a protocol disagreement * with the server. */ static void afs_log_error(struct afs_call *call, s32 remote_abort) { static int max = 0; const char *msg; int m; switch (remote_abort) { case RX_EOF: msg = "unexpected EOF"; break; case RXGEN_CC_MARSHAL: msg = "client marshalling"; break; case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break; case RXGEN_SS_MARSHAL: msg = "server marshalling"; break; case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break; case RXGEN_DECODE: msg = "opcode decode"; break; case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break; case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break; case -32: msg = "insufficient data"; break; default: return; } m = max; if (m < 3) { max = m + 1; pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", msg, call->type->name, &call->alist->addrs[call->addr_ix].transport); } } /* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { enum afs_call_state state; size_t len; u32 abort_code, remote_abort = 0; int ret; _enter("%s", call->type->name); while (state = READ_ONCE(call->state), state == AFS_CALL_CL_AWAIT_REPLY || state == AFS_CALL_SV_AWAIT_OP_ID || state == AFS_CALL_SV_AWAIT_REQUEST || state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { len = 0; iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, &len, false, &remote_abort, &call->service_id); trace_afs_receive_data(call, &call->def_iter, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; if (ret < 0 || ret == 1) { if (ret == 1) ret = 0; goto call_complete; } return; } ret = call->type->deliver(call); state = READ_ONCE(call->state); if (ret == 0 && call->unmarshalling_error) ret = -EBADMSG; switch (ret) { case 0: afs_queue_call_work(call); if (state == AFS_CALL_CL_PROC_REPLY) { if (call->op) set_bit(AFS_SERVER_FL_MAY_HAVE_CB, &call->op->server->flags); goto call_complete; } ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); goto done; case -EINPROGRESS: case -EAGAIN: goto out; case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); afs_log_error(call, call->abort_code); goto done; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KIV"); goto local_abort; case -EIO: pr_err("kAFS: Call %u in bad state %u\n", call->debug_id, state); fallthrough; case -ENODATA: case -EBADMSG: case -EMSGSIZE: case -ENOMEM: case -EFAULT: abort_code = RXGEN_CC_UNMARSHAL; if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KUM"); goto local_abort; default: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KER"); goto local_abort; } } done: if (call->type->done) call->type->done(call); out: _leave(""); return; local_abort: abort_code = 0; call_complete: afs_set_call_complete(call, ret, remote_abort); state = AFS_CALL_COMPLETE; goto done; } /* * Wait synchronously for a call to complete and clean up the call struct. */ long afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac) { long ret; bool rxrpc_complete = false; DECLARE_WAITQUEUE(myself, current); _enter(""); ret = call->error; if (ret < 0) goto out; add_wait_queue(&call->waitq, &myself); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); /* deliver any messages that are in the queue */ if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && call->need_attention) { call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { /* rxrpc terminated the call. */ rxrpc_complete = true; break; } schedule(); } remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { if (rxrpc_complete) { afs_set_call_complete(call, call->error, call->abort_code); } else { /* Kill off the call if it's still live. */ _debug("call interrupted"); if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, RX_USER_ABORT, -EINTR, "KWI")) afs_set_call_complete(call, -EINTR, 0); } } spin_lock_bh(&call->state_lock); ac->abort_code = call->abort_code; ac->error = call->error; spin_unlock_bh(&call->state_lock); ret = ac->error; switch (ret) { case 0: ret = call->ret0; call->ret0 = 0; fallthrough; case -ECONNABORTED: ac->responded = true; break; } out: _debug("call complete"); afs_put_call(call); _leave(" = %p", (void *)ret); return ret; } /* * wake up a waiting call */ static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; call->need_attention = true; wake_up(&call->waitq); } /* * wake up an asynchronous call */ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; int r; trace_afs_notify_call(rxcall, call); call->need_attention = true; if (__refcount_inc_not_zero(&call->ref, &r)) { trace_afs_call(call, afs_call_trace_wake, r + 1, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); if (!queue_work(afs_async_calls, &call->async_work)) afs_put_call(call); } } /* * Perform I/O processing on an asynchronous call. The work item carries a ref * to the call struct that we either need to release or to pass on. */ static void afs_process_async_call(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, async_work); _enter(""); if (call->state < AFS_CALL_COMPLETE && call->need_attention) { call->need_attention = false; afs_deliver_to_call(call); } afs_put_call(call); _leave(""); } static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_call *call = (struct afs_call *)user_call_ID; call->rxcall = rxcall; } /* * Charge the incoming call preallocation. */ void afs_charge_preallocation(struct work_struct *work) { struct afs_net *net = container_of(work, struct afs_net, charge_preallocation_work); struct afs_call *call = net->spare_incoming_call; for (;;) { if (!call) { call = afs_alloc_call(net, &afs_RXCMxxxx, GFP_KERNEL); if (!call) break; call->drop_ref = true; call->async = true; call->state = AFS_CALL_SV_AWAIT_OP_ID; init_waitqueue_head(&call->waitq); afs_extract_to_tmp(call); } if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, afs_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) break; call = NULL; } net->spare_incoming_call = call; } /* * Discard a preallocated call when a socket is shut down. */ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_call *call = (struct afs_call *)user_call_ID; call->rxcall = NULL; afs_put_call(call); } /* * Notification of an incoming call. */ static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_net *net = afs_sock2net(sk); queue_work(afs_wq, &net->charge_preallocation_work); } /* * Grab the operation ID from an incoming cache manager call. The socket * buffer is discarded on error or if we don't yet have sufficient data. */ static int afs_deliver_cm_op_id(struct afs_call *call) { int ret; _enter("{%zu}", iov_iter_count(call->iter)); /* the operation ID forms the first four bytes of the request data */ ret = afs_extract_data(call, true); if (ret < 0) return ret; call->operation_ID = ntohl(call->tmp); afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST); /* ask the cache manager to route the call (it'll change the call type * if successful) */ if (!afs_cm_incoming_call(call)) return -ENOTSUPP; trace_afs_cb_call(call); /* pass responsibility for the remainer of this message off to the * cache manager op */ return call->type->deliver(call); } /* * Advance the AFS call state when an RxRPC service call ends the transmit * phase. */ static void afs_notify_end_reply_tx(struct sock *sock, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; afs_set_call_state(call, AFS_CALL_SV_REPLYING, AFS_CALL_SV_AWAIT_ACK); } /* * send an empty reply */ void afs_send_empty_reply(struct afs_call *call) { struct afs_net *net = call->net; struct msghdr msg; _enter(""); rxrpc_kernel_set_tx_length(net->socket, call->rxcall, 0); msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; switch (rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, 0, afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); return; case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, RXGEN_SS_MARSHAL, -ENOMEM, "KOO"); fallthrough; default: _leave(" [error]"); return; } } /* * send a simple reply */ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) { struct afs_net *net = call->net; struct msghdr msg; struct kvec iov[1]; int n; _enter(""); rxrpc_kernel_set_tx_length(net->socket, call->rxcall, len); iov[0].iov_base = (void *) buf; iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; n = rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, len, afs_notify_end_reply_tx); if (n >= 0) { /* Success */ _leave(" [replied]"); return; } if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, RXGEN_SS_MARSHAL, -ENOMEM, "KOO"); } _leave(" [error]"); } /* * Extract a piece of data from the received data socket buffers. */ int afs_extract_data(struct afs_call *call, bool want_more) { struct afs_net *net = call->net; struct iov_iter *iter = call->iter; enum afs_call_state state; u32 remote_abort = 0; int ret; _enter("{%s,%zu,%zu},%d", call->type->name, call->iov_len, iov_iter_count(iter), want_more); ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, &call->iov_len, want_more, &remote_abort, &call->service_id); if (ret == 0 || ret == -EAGAIN) return ret; state = READ_ONCE(call->state); if (ret == 1) { switch (state) { case AFS_CALL_CL_AWAIT_REPLY: afs_set_call_state(call, state, AFS_CALL_CL_PROC_REPLY); break; case AFS_CALL_SV_AWAIT_REQUEST: afs_set_call_state(call, state, AFS_CALL_SV_REPLYING); break; case AFS_CALL_COMPLETE: kdebug("prem complete %d", call->error); return afs_io_error(call, afs_io_error_extract); default: break; } return 0; } afs_set_call_complete(call, ret, remote_abort); return ret; } /* * Log protocol error production. */ noinline int afs_protocol_error(struct afs_call *call, enum afs_eproto_cause cause) { trace_afs_protocol_error(call, cause); if (call) call->unmarshalling_error = true; return -EBADMSG; } |
214 214 10 10 213 155 76 75 76 213 76 155 20 12 20 20 19 20 20 1209 20 20 20 20 20 20 20 115 115 13 14 14 14 13 14 14 14 17 17 4 17 8 17 17 21 21 21 7 19 19 18 14 17 18 21 1201 1204 1203 1202 1197 1201 1203 1202 1114 1119 1116 114 114 99 109 115 115 114 115 114 115 114 115 115 115 115 115 385 386 388 76 386 386 137 135 135 135 136 375 133 385 29 18 29 381 382 382 382 382 383 383 382 379 381 383 383 380 29 29 29 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? */ int page_cluster; /* Protecting only lru_rotate.pvec which requires disabling interrupts */ struct lru_rotate { local_lock_t lock; struct pagevec pvec; }; static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * The following struct pagevec are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ struct lru_pvecs { local_lock_t lock; struct pagevec lru_add; struct pagevec lru_deactivate_file; struct pagevec lru_deactivate; struct pagevec lru_lazyfree; #ifdef CONFIG_SMP struct pagevec activate_page; #endif }; static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. */ static void __page_cache_release(struct page *page) { if (PageLRU(page)) { struct lruvec *lruvec; unsigned long flags; lruvec = lock_page_lruvec_irqsave(page, &flags); del_page_from_lru_list(page, lruvec); __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); } static void __put_single_page(struct page *page) { __page_cache_release(page); mem_cgroup_uncharge(page); free_unref_page(page, 0); } static void __put_compound_page(struct page *page) { /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. This is because hugetlb page does never have PageLRU set * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ if (!PageHuge(page)) __page_cache_release(page); destroy_compound_page(page); } void __put_page(struct page *page) { if (is_zone_device_page(page)) { put_dev_pagemap(page->pgmap); /* * The page belongs to the device that created pgmap. Do * not return it to page allocator. */ return; } if (unlikely(PageCompound(page))) __put_compound_page(page); else __put_single_page(page); } EXPORT_SYMBOL(__put_page); /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru * * Release a list of pages which are strung together on page.lru. Currently * used by read_cache_pages() and related error recovery code. */ void put_pages_list(struct list_head *pages) { while (!list_empty(pages)) { struct page *victim; victim = lru_to_page(pages); list_del(&victim->lru); put_page(victim); } } EXPORT_SYMBOL(put_pages_list); /* * get_kernel_pages() - pin kernel pages in memory * @kiov: An array of struct kvec structures * @nr_segs: number of segments to pin * @write: pinning for read/write, currently ignored * @pages: array that receives pointers to the pages pinned. * Should be at least nr_segs long. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. Each page returned must be released * with a put_page() call when it is finished with. */ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, struct page **pages) { int seg; for (seg = 0; seg < nr_segs; seg++) { if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) return seg; pages[seg] = kmap_to_page(kiov[seg].iov_base); get_page(pages[seg]); } return seg; } EXPORT_SYMBOL_GPL(get_kernel_pages); static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); (*move_fn)(page, lruvec); SetPageLRU(page); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) { if (!PageUnevictable(page)) { del_page_from_lru_list(page, lruvec); ClearPageActive(page); add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, thp_nr_pages(page)); } } /* return true if pagevec needs to drain */ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) { bool ret = false; if (!pagevec_add(pvec, page) || PageCompound(page) || lru_cache_disabled()) ret = true; return ret; } /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. * * rotate_reclaimable_page() must disable IRQs, to prevent nasty races. */ void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; get_page(page); local_lock_irqsave(&lru_rotate.lock, flags); pvec = this_cpu_ptr(&lru_rotate.pvec); if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } } void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { do { unsigned long lrusize; /* * Hold lruvec->lru_lock is safe here, since * 1) The pinned lruvec in reclaim, or * 2) From a pre-LRU page during refault (which also holds the * rcu lock, so would be safe even if the page was on the LRU * and could move simultaneously to a new lruvec). */ spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += nr_pages; else lruvec->anon_cost += nr_pages; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. */ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } void lru_note_cost_page(struct page *page) { lru_note_cost(mem_cgroup_page_lruvec(page), page_is_file_lru(page), thp_nr_pages(page)); } static void __activate_page(struct page *page, struct lruvec *lruvec) { if (!PageActive(page) && !PageUnevictable(page)) { int nr_pages = thp_nr_pages(page); del_page_from_lru_list(page, lruvec); SetPageActive(page); add_page_to_lru_list(page, lruvec); trace_mm_lru_activate(page); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } } #ifdef CONFIG_SMP static void activate_page_drain(int cpu) { struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, __activate_page); } static bool need_activate_page_drain(int cpu) { return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } static void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { struct pagevec *pvec; local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock); } } #else static inline void activate_page_drain(int cpu) { } static void activate_page(struct page *page) { struct lruvec *lruvec; page = compound_head(page); if (TestClearPageLRU(page)) { lruvec = lock_page_lruvec_irq(page); __activate_page(page, lruvec); unlock_page_lruvec_irq(lruvec); SetPageLRU(page); } } #endif static void __lru_cache_activate_page(struct page *page) { struct pagevec *pvec; int i; local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); /* * Search backwards on the optimistic assumption that the page being * activated has just been added to this pagevec. Note that only * the local pagevec is examined as a !PageLRU page could be in the * process of being released, reclaimed, migrated or on a remote * pagevec that is currently being drained. Furthermore, marking * a remote pagevec's page PageActive potentially hits a race where * a page is marked PageActive just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ for (i = pagevec_count(pvec) - 1; i >= 0; i--) { struct page *pagevec_page = pvec->pages[i]; if (pagevec_page == page) { SetPageActive(page); break; } } local_unlock(&lru_pvecs.lock); } /* * Mark a page as having seen activity. * * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced * * When a newly allocated page is not yet visible, so safe for non-atomic ops, * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void mark_page_accessed(struct page *page) { page = compound_head(page); if (!PageReferenced(page)) { SetPageReferenced(page); } else if (PageUnevictable(page)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * evictable page accessed has no effect. */ } else if (!PageActive(page)) { /* * If the page is on the LRU, queue it for activation via * lru_pvecs.activate_page. Otherwise, assume the page is on a * pagevec, mark it active and it'll be moved to the active * LRU on the next drain. */ if (PageLRU(page)) activate_page(page); else __lru_cache_activate_page(page); ClearPageReferenced(page); workingset_activation(page); } if (page_is_idle(page)) clear_page_idle(page); } EXPORT_SYMBOL(mark_page_accessed); /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. * * Queue the page for addition to the LRU via pagevec. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * pagevec is drained. This gives a chance for the caller of lru_cache_add() * have the page added to the active list using mark_page_accessed(). */ void lru_cache_add(struct page *page) { struct pagevec *pvec; VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); get_page(page); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); if (pagevec_add_and_need_flush(pvec, page)) __pagevec_lru_add(pvec); local_unlock(&lru_pvecs.lock); } EXPORT_SYMBOL(lru_cache_add); /** * lru_cache_add_inactive_or_unevictable * @page: the page to be added to LRU * @vma: vma in which page is mapped for determining reclaimability * * Place @page on the inactive or unevictable LRU list, depending on its * evictability. */ void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { bool unevictable; VM_BUG_ON_PAGE(PageLRU(page), page); unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; if (unlikely(unevictable) && !TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); /* * We use the irq-unsafe __mod_zone_page_state because this * counter is not modified from interrupt context, and the pte * lock is held(spinlock), which implies preemption disabled. */ __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } lru_cache_add(page); } /* * If the page can not be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the page isn't page_mapped and dirty/writeback, the page * could reclaim asap using PG_reclaim. * * 1. active, mapped page -> none * 2. active, dirty/writeback page -> inactive, head, PG_reclaim * 3. inactive, mapped page -> none * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, why it moves inactive's head, the VM expects the page would * be write it out by flusher threads as this is much more effective * than the single-page writeout from reclaim. */ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) { bool active = PageActive(page); int nr_pages = thp_nr_pages(page); if (PageUnevictable(page)) return; /* Some processes are using the page */ if (page_mapped(page)) return; del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); if (PageWriteback(page) || PageDirty(page)) { /* * PG_reclaim could be raced with end_page_writeback * It can make readahead confusing. But race window * is _really_ small and it's non-critical problem. */ add_page_to_lru_list(page, lruvec); SetPageReclaim(page); } else { /* * The page's writeback ends up during pagevec * We move that page into tail of inactive. */ add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) { if (PageActive(page) && !PageUnevictable(page)) { int nr_pages = thp_nr_pages(page); del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); add_page_to_lru_list(page, lruvec); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) { if (PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { int nr_pages = thp_nr_pages(page); del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); /* * Lazyfree pages are clean anonymous pages. They have * PG_swapbacked flag cleared, to distinguish them from normal * anonymous pages */ ClearPageSwapBacked(page); add_page_to_lru_list(page, lruvec); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } } /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); pvec = &per_cpu(lru_rotate.pvec, cpu); /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(pagevec_count(pvec))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&lru_rotate.lock, flags); pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_fn); pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); } /** * deactivate_file_page - forcefully deactivate a file page * @page: page to deactivate * * This function hints the VM that @page is a good reclaim candidate, * for example if its invalidation fails due to the page being dirty * or under writeback. */ void deactivate_file_page(struct page *page) { /* * In a workload with many unevictable page such as mprotect, * unevictable page deactivation for accelerating reclaim is pointless. */ if (PageUnevictable(page)) return; if (likely(get_page_unless_zero(page))) { struct pagevec *pvec; local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); local_unlock(&lru_pvecs.lock); } } /* * deactivate_page - deactivate a page * @page: page to deactivate * * deactivate_page() moves @page to the inactive list if @page was on the active * list and was not an unevictable page. This is done to accelerate the reclaim * of @page. */ void deactivate_page(struct page *page) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { struct pagevec *pvec; local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn); local_unlock(&lru_pvecs.lock); } } /** * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate * * mark_page_lazyfree() moves @page to the inactive file list. * This is done to accelerate the reclaim of @page. */ void mark_page_lazyfree(struct page *page) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { struct pagevec *pvec; local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); local_unlock(&lru_pvecs.lock); } } void lru_add_drain(void) { local_lock(&lru_pvecs.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&lru_pvecs.lock); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&lru_pvecs.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&lru_pvecs.lock); invalidate_bh_lrus_cpu(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&lru_pvecs.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&lru_pvecs.lock); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee pagevec counter stores visible by this CPU are visible to * other CPUs before loading the current drain generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the new global * drain generation number is stored before loading pagevec counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (force_all_cpus || pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of pages to be migrated using isolate_lru_page(). * It drains pages on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); #ifdef CONFIG_SMP /* * lru_add_drain_all in the force mode will schedule draining on * all online CPUs so any calls of lru_cache_disabled wrapped by * local_lock or preemption disabled would be ordered by that. * The atomic operation doesn't need to have stronger ordering * requirements because that is enforeced by the scheduling * guarantees. */ __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * release_pages - batched put_page() * @pages: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. */ void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags; unsigned int lock_batch; for (i = 0; i < nr; i++) { struct page *page = pages[i]; /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the * same lruvec. The lock is held only if lruvec != NULL. */ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } page = compound_head(page); if (is_huge_zero_page(page)) continue; if (is_zone_device_page(page)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } /* * ZONE_DEVICE pages that return 'false' from * page_is_devmap_managed() do not require special * processing, and instead, expect a call to * put_page_testzero(). */ if (page_is_devmap_managed(page)) { put_devmap_managed_page(page); continue; } if (put_page_testzero(page)) put_dev_pagemap(page->pgmap); continue; } if (!put_page_testzero(page)) continue; if (PageCompound(page)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { struct lruvec *prev_lruvec = lruvec; lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); if (prev_lruvec != lruvec) lock_batch = 0; del_page_from_lru_list(page, lruvec); __clear_page_lru_flags(page); } __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); } EXPORT_SYMBOL(release_pages); /* * The pages which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those pages may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __pagevec_release() will drain those queues here. __pagevec_lru_add() * and __pagevec_lru_add_active() call release_pages() directly to avoid * mutual recursion. */ void __pagevec_release(struct pagevec *pvec) { if (!pvec->percpu_pvec_drained) { lru_add_drain(); pvec->percpu_pvec_drained = true; } release_pages(pvec->pages, pagevec_count(pvec)); pagevec_reinit(pvec); } EXPORT_SYMBOL(__pagevec_release); static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) { int was_unevictable = TestClearPageUnevictable(page); int nr_pages = thp_nr_pages(page); VM_BUG_ON_PAGE(PageLRU(page), page); /* * Page becomes evictable in two ways: * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. * 2) Before acquiring LRU lock to put the page to correct LRU and then * a) do PageLRU check with lock [check_move_unevictable_pages] * b) do PageLRU check before lock [clear_page_mlock] * * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need * following strict ordering: * * #0: __pagevec_lru_add_fn #1: clear_page_mlock * * SetPageLRU() TestClearPageMlocked() * smp_mb() // explicit ordering // above provides strict * // ordering * PageMlocked() PageLRU() * * * if '#1' does not observe setting of PG_lru by '#0' and fails * isolation, the explicit barrier will make sure that page_evictable * check will put the page in correct LRU. Without smp_mb(), SetPageLRU * can be reordered after PageMlocked check and can make '#1' to fail * the isolation of the page whose Mlocked bit is cleared (#0 is also * looking at the same page) and the evictable page will be stranded * in an unevictable LRU. */ SetPageLRU(page); smp_mb__after_atomic(); if (page_evictable(page)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { ClearPageActive(page); SetPageUnevictable(page); if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } add_page_to_lru_list(page, lruvec); trace_mm_lru_insertion(page); } /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ void __pagevec_lru_add(struct pagevec *pvec) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); __pagevec_lru_add_fn(page, lruvec); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } /** * pagevec_remove_exceptionals - pagevec exceptionals pruning * @pvec: The pagevec to prune * * find_get_entries() fills both pages and XArray value entries (aka * exceptional entries) into the pagevec. This function prunes all * exceptionals from @pvec without leaving holes, so that it can be * passed on to page-only pagevec operations. */ void pagevec_remove_exceptionals(struct pagevec *pvec) { int i, j; for (i = 0, j = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; if (!xa_is_value(page)) pvec->pages[j++] = page; } pvec->nr = j; } /** * pagevec_lookup_range - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search * @start: The starting page index * @end: The final page index * * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE * pages in the mapping starting from index @start and upto index @end * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a * reference against the pages in @pvec. * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. We * also update @start to index the next page for the traversal. * * pagevec_lookup_range() returns the number of pages which were found. If this * number is smaller than PAGEVEC_SIZE, the end of specified range has been * reached. */ unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, pgoff_t *start, pgoff_t end) { pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } EXPORT_SYMBOL(pagevec_lookup_range_tag); /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ } #ifdef CONFIG_DEV_PAGEMAP_OPS void put_devmap_managed_page(struct page *page) { int count; if (WARN_ON_ONCE(!page_is_devmap_managed(page))) return; count = page_ref_dec_return(page); /* * devmap page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ if (count == 1) free_devmap_managed_page(page); else if (!count) __put_page(page); } EXPORT_SYMBOL(put_devmap_managed_page); #endif |
2126 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips * Copyright (c) 2008-2009 Marvell Semiconductor */ #ifndef __LINUX_NET_DSA_H #define __LINUX_NET_DSA_H #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/ethtool.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/platform_data/dsa.h> #include <linux/phylink.h> #include <net/devlink.h> #include <net/switchdev.h> struct tc_action; struct phy_device; struct fixed_phy_status; struct phylink_link_state; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 #define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 #define DSA_TAG_PROTO_DSA_VALUE 3 #define DSA_TAG_PROTO_EDSA_VALUE 4 #define DSA_TAG_PROTO_GSWIP_VALUE 5 #define DSA_TAG_PROTO_KSZ9477_VALUE 6 #define DSA_TAG_PROTO_KSZ9893_VALUE 7 #define DSA_TAG_PROTO_LAN9303_VALUE 8 #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 #define DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, }; struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); unsigned int needed_headroom; unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC * address, in which case the DSA master would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. */ bool promisc_on_master; }; /* This structure defines the control interfaces that are overlayed by the * DSA layer on top of the DSA CPU/management net_device instance. This is * used by the core net_device layer while calling various net_device_ops * function pointers. */ struct dsa_netdevice_ops { int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); }; #define DSA_TAG_DRIVER_ALIAS "dsa_tag-" #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) struct dsa_switch_tree { struct list_head list; /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; /* Tree identifier */ unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; /* Has this tree been applied to the hardware? */ bool setup; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; /* Default tagging protocol preferred by the switches in this * tree. */ enum dsa_tag_protocol default_proto; /* * Configuration data for the platform device that owns * this dsa switch tree instance. */ struct dsa_platform_data *pd; /* List of switch ports */ struct list_head ports; /* List of DSA links composing the routing table */ struct list_head rtable; /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ struct net_device **lags; unsigned int lags_len; /* Track the largest switch index within a tree */ unsigned int last_switch; }; #define dsa_lags_foreach_id(_id, _dst) \ for ((_id) = 0; (_id) < (_dst)->lags_len; (_id)++) \ if ((_dst)->lags[(_id)]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if ((_dp)->lag_dev == (_lag)) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, unsigned int id) { return dst->lags[id]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, struct net_device *lag) { unsigned int id; dsa_lags_foreach_id(id, dst) { if (dsa_lag_dev(dst, id) == lag) return id; } return -ENODEV; } /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, DSA_PORT_MALL_POLICER, }; /* TC mirroring entry */ struct dsa_mall_mirror_tc_entry { u8 to_local_port; bool ingress; }; /* TC port policer entry */ struct dsa_mall_policer_tc_entry { u32 burst; u64 rate_bytes_per_sec; }; /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; unsigned long cookie; enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; struct dsa_mall_policer_tc_entry policer; }; }; struct dsa_port { /* A CPU port is physically connected to a master device. * A user port exposed to userspace has a slave device. */ union { struct net_device *master; struct net_device *slave; }; /* Copy of the tagging protocol operations, for quicker access * in the data path. Valid only for the CPU ports. */ const struct dsa_device_ops *tag_ops; /* Copies for faster access in master receive hot path */ struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU, DSA_PORT_TYPE_DSA, DSA_PORT_TYPE_USER, } type; struct dsa_switch *ds; unsigned int index; const char *name; struct dsa_port *cpu_dp; u8 mac[ETH_ALEN]; struct device_node *dn; unsigned int ageing_time; bool vlan_filtering; /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ bool learning; u8 stp_state; struct net_device *bridge_dev; int bridge_num; struct devlink_port devlink_port; bool devlink_port_setup; struct phylink *pl; struct phylink_config pl_config; struct net_device *lag_dev; bool lag_tx_enabled; struct net_device *hsr_dev; struct list_head list; /* * Give the switch driver somewhere to hang its per-port private data * structures (accessible from the tagger). */ void *priv; /* * Original copy of the master netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; /* * Original copy of the master netdev net_device_ops */ const struct dsa_netdevice_ops *netdev_ops; /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. */ struct list_head fdbs; struct list_head mdbs; bool setup; }; /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, * so keep it stupid at the moment and list them all. */ struct dsa_link { struct dsa_port *dp; struct dsa_port *link_dp; struct list_head list; }; struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; }; struct dsa_switch { bool setup; struct device *dev; /* * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; unsigned int index; /* Listener for switch fabric events */ struct notifier_block nb; /* * Give the switch driver somewhere to hang its private data * structure. */ void *priv; /* * Configuration data for this switch. */ struct dsa_chip_data *cd; /* * The switch operations. */ const struct dsa_switch_ops *ops; /* * Slave mii_bus and devices for the individual ports. */ u32 phys_mii_mask; struct mii_bus *slave_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; /* Storage for drivers using tag_8021q */ struct dsa_8021q_context *tag_8021q_ctx; /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; /* Disallow bridge core from requesting different VLAN awareness * settings on ports if not hardware-supported */ bool vlan_filtering_is_global; /* Keep VLAN filtering enabled on ports not offloading any upper. */ bool needs_standalone_vlan_filtering; /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges * that have vlan_filtering=0. All drivers should ideally set this (and * then the option would get removed), but it is unknown whether this * would break things or not. */ bool configure_vlan_while_not_filtering; /* If the switch driver always programs the CPU port as egress tagged * despite the VLAN configuration indicating otherwise, then setting * @untag_bridge_pvid will force the DSA receive path to pop the bridge's * default_pvid VLAN tagged frames to offer a consistent behavior * between a vlan_filtering=0 and vlan_filtering=1 bridge device. */ bool untag_bridge_pvid; /* Let DSA manage the FDB entries towards the CPU, based on the * software bridge database. */ bool assisted_learning_on_cpu_port; /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ bool vlan_filtering; /* MAC PCS does not provide link state change interrupt, and requires * polling. Flag passed on to PHYLINK. */ bool pcs_poll; /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. */ bool mtu_enforcement_ingress; /* Drivers that benefit from having an ID associated with each * offloaded LAG should set this to the maximum number of * supported IDs. DSA will then maintain a mapping of _at * least_ these many IDs, accessible to drivers via * dsa_lag_id(). */ unsigned int num_lag_ids; /* Drivers that support bridge forwarding offload should set this to * the maximum number of bridges spanning the same switch tree (or all * trees, in the case of cross-tree bridging support) that can be * offloaded. */ unsigned int num_fwd_offloading_bridges; size_t num_ports; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) return dp; return NULL; } static inline bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_port_is_cpu(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_port_is_user(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_USER; } static inline bool dsa_port_is_unused(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } #define dsa_tree_for_each_user_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_safe(_dp, _next, _ds) \ list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \ list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_available_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (!dsa_port_is_unused((_dp))) #define dsa_switch_for_each_user_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) static inline u32 dsa_user_ports(struct dsa_switch *ds) { u32 mask = 0; int p; for (p = 0; p < ds->num_ports; p++) if (dsa_is_user_port(ds, p)) mask |= BIT(p); return mask; } /* Return the local port used to reach an arbitrary switch device */ static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) { struct dsa_switch_tree *dst = ds->dst; struct dsa_link *dl; list_for_each_entry(dl, &dst->rtable, list) if (dl->dp->ds == ds && dl->link_dp->ds->index == device) return dl->dp->index; return ds->num_ports; } /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) { if (device == ds->index) return port; else return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) { const struct dsa_port *dp = dsa_to_port(ds, port); const struct dsa_port *cpu_dp = dp->cpu_dp; if (!cpu_dp) return port; return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } /* Return true if this is the local port used to reach the CPU port */ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) { if (dsa_is_unused_port(ds, port)) return false; return port == dsa_upstream_port(ds, port); } /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. */ static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds, struct dsa_switch *downstream_ds) { int routing_port; if (upstream_ds == downstream_ds) return true; routing_port = dsa_routing_port(downstream_ds, upstream_ds->index); return dsa_is_upstream_port(downstream_ds, routing_port); } static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) { const struct dsa_switch *ds = dp->ds; if (ds->vlan_filtering_is_global) return ds->vlan_filtering; else return dp->vlan_filtering; } static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge_dev) return NULL; if (dp->lag_dev) return dp->lag_dev; else if (dp->hsr_dev) return dp->hsr_dev; return dp->slave; } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { /* * Tagging protocol helpers called for the CPU ports and DSA links. * @get_tag_protocol retrieves the initial tagging protocol and is * mandatory. Switches which can operate using multiple tagging * protocols should implement @change_tag_protocol and report in * @get_tag_protocol the tagger in current use. */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); int (*change_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol proto); /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); /* Per-port initialization and destruction methods. Mandatory if the * driver registers devlink port regions, optional otherwise. */ int (*port_setup)(struct dsa_switch *ds, int port); void (*port_teardown)(struct dsa_switch *ds, int port); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. */ int (*phy_read)(struct dsa_switch *ds, int port, int regnum); int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); /* * Link state adjustment (called from libphy) */ void (*adjust_link)(struct dsa_switch *ds, int port, struct phy_device *phydev); void (*fixed_link_update)(struct dsa_switch *ds, int port, struct fixed_phy_status *st); /* * PHYLINK integration */ void (*phylink_validate)(struct dsa_switch *ds, int port, unsigned long *supported, struct phylink_link_state *state); int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); void (*phylink_mac_config)(struct dsa_switch *ds, int port, unsigned int mode, const struct phylink_link_state *state); void (*phylink_mac_an_restart)(struct dsa_switch *ds, int port); void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); void (*phylink_mac_link_up)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface, struct phy_device *phydev, int speed, int duplex, bool tx_pause, bool rx_pause); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*self_test)(struct dsa_switch *ds, int port, struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN */ void (*get_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); /* * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, struct ethtool_ts_info *ts); /* * Suspend and resume */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); /* * Port enable/disable */ int (*port_enable)(struct dsa_switch *ds, int port, struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); /* * Compatibility between device trees defining multiple CPU ports and * drivers which are not OK to use by default the numerically smallest * CPU port of a switch for its local ports. This can return NULL, * meaning "don't know/don't care". */ struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); /* * Port's MAC EEE settings */ int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_eee *e); int (*get_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_eee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); int (*get_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); int (*set_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); /* * Register access. */ int (*get_regs_len)(struct dsa_switch *ds, int port); void (*get_regs)(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *p); /* * Upper device tracking. */ int (*port_prechangeupper)(struct dsa_switch *ds, int port, struct netdev_notifier_changeupper_info *info); /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct net_device *bridge); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct net_device *bridge); /* Called right after .port_bridge_join() */ int (*port_bridge_tx_fwd_offload)(struct dsa_switch *ds, int port, struct net_device *bridge, int bridge_num); /* Called right before .port_bridge_leave() */ void (*port_bridge_tx_fwd_unoffload)(struct dsa_switch *ds, int port, struct net_device *bridge, int bridge_num); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); void (*port_fast_age)(struct dsa_switch *ds, int port); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); int (*port_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); /* * RXNFC */ int (*get_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc, u32 *rule_locs); int (*set_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc); /* * TC integration */ int (*cls_flower_add)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_del)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_stats)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, struct dsa_mall_policer_tc_entry *policer); void (*port_policer_del)(struct dsa_switch *ds, int port); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); /* * Cross-chip operations */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct net_device *br); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct net_device *br); int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, int port, struct net_device *lag, struct netdev_lag_upper_info *info); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, int port, struct net_device *lag); /* * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, struct ifreq *ifr); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct ifreq *ifr); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); /* Devlink parameters, etc */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_param_set)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); int (*devlink_sb_pool_get)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, struct netlink_ext_ack *extack); int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 *p_pool_index, u32 *p_threshold); int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_cur, u32 *p_max); int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through * this method. By MTU, one understands the SDU (L2 payload) length. * If the switch needs to account for the DSA tag on the CPU port, this * method needs to do so privately. */ int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); /* * LAG integration */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, struct net_device *lag, struct netdev_lag_upper_info *info); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct net_device *lag); /* * HSR integration */ int (*port_hsr_join)(struct dsa_switch *ds, int port, struct net_device *hsr); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); /* * MRP integration */ int (*port_mrp_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); /* * tag_8021q operations */ int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); int dsa_devlink_resource_register(struct dsa_switch *ds, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); void dsa_devlink_resources_unregister(struct dsa_switch *ds); void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); struct devlink_region * dsa_devlink_region_create(struct dsa_switch *ds, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); struct devlink_region * dsa_devlink_port_region_create(struct dsa_switch *ds, int port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size); void dsa_devlink_region_destroy(struct devlink_region *region); struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct dsa_switch *ds; }; static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) { struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port) { struct devlink *dl = port->devlink; struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline int dsa_devlink_port_to_port(struct devlink_port *port) { return port->index; } struct dsa_switch_driver { struct list_head list; const struct dsa_switch_ops *ops; }; struct net_device *dsa_dev_to_net_device(struct device *dev); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; #endif return false; } /* All DSA tags that push the EtherType to the right (basically all except tail * tags, which don't break dissection) can be treated the same from the * perspective of the flow dissector. * * We need to return: * - offset: the (B - A) difference between: * A. the position of the real EtherType and * B. the current skb->data (aka ETH_HLEN bytes into the frame, aka 2 bytes * after the normal EtherType was supposed to be) * The offset in bytes is exactly equal to the tagger overhead (and half of * that, in __be16 shorts). * * - proto: the value of the real EtherType. */ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; #endif } #if IS_ENABLED(CONFIG_NET_DSA) static inline int __dsa_netdevice_ops_check(struct net_device *dev) { int err = -EOPNOTSUPP; if (!dev->dsa_ptr) return err; if (!dev->dsa_ptr->netdev_ops) return err; return 0; } static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { const struct dsa_netdevice_ops *ops; int err; err = __dsa_netdevice_ops_check(dev); if (err) return err; ops = dev->dsa_ptr->netdev_ops; return ops->ndo_eth_ioctl(dev, ifr, cmd); } #else static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { return -EOPNOTSUPP; } #endif void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); #else static inline int dsa_switch_suspend(struct dsa_switch *ds) { return 0; } static inline int dsa_switch_resume(struct dsa_switch *ds) { return 0; } #endif /* CONFIG_PM_SLEEP */ #if IS_ENABLED(CONFIG_NET_DSA) bool dsa_slave_dev_check(const struct net_device *dev); #else static inline bool dsa_slave_dev_check(const struct net_device *dev) { return false; } #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); int dsa_port_get_phy_sset_count(struct dsa_port *dp); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); struct dsa_tag_driver { const struct dsa_device_ops *ops; struct list_head list; struct module *owner; }; void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count, struct module *owner); void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count); #define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ static int __init dsa_tag_driver_module_init(void) \ { \ dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ THIS_MODULE); \ return 0; \ } \ module_init(dsa_tag_driver_module_init); \ \ static void __exit dsa_tag_driver_module_exit(void) \ { \ dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ } \ module_exit(dsa_tag_driver_module_exit) /** * module_dsa_tag_drivers() - Helper macro for registering DSA tag * drivers * @__ops_array: Array of tag driver strucutres * * Helper macro for DSA tag drivers which do not do anything special * in module init/exit. Each module may only use this macro once, and * calling it replaces module_init() and module_exit(). */ #define module_dsa_tag_drivers(__ops_array) \ dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) #define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops /* Create a static structure we can build a linked list of dsa_tag * drivers */ #define DSA_TAG_DRIVER(__ops) \ static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ .ops = &__ops, \ } /** * module_dsa_tag_driver() - Helper macro for registering a single DSA tag * driver * @__ops: Single tag driver structures * * Helper macro for DSA tag drivers which do not do anything special * in module init/exit. Each module may only use this macro once, and * calling it replaces module_init() and module_exit(). */ #define module_dsa_tag_driver(__ops) \ DSA_TAG_DRIVER(__ops); \ \ static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ &DSA_TAG_DRIVER_NAME(__ops) \ }; \ module_dsa_tag_drivers(dsa_tag_driver_array) #endif |
890 889 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 | // SPDX-License-Identifier: GPL-2.0-or-later /* * "LAPB via ethernet" driver release 001 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * This is a "pseudo" network driver to allow LAPB over Ethernet. * * This driver can use any ethernet destination address, and can be * limited to accept frames from one dedicated ethernet card only. * * History * LAPBETH 001 Jonathan Naylor Cloned from bpqether.c * 2000-10-29 Henner Eisen lapb_data_indication() return status. * 2000-11-14 Henner Eisen dev_hold/put, NETDEV_GOING_DOWN support */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/stat.h> #include <linux/module.h> #include <linux/lapb.h> #include <linux/init.h> #include <net/x25device.h> static const u8 bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; /* If this number is made larger, check that the temporary string buffer * in lapbeth_new_device is large enough to store the probe device name. */ #define MAXLAPBDEV 100 struct lapbethdev { struct list_head node; struct net_device *ethdev; /* link to ethernet device */ struct net_device *axdev; /* lapbeth device (lapb#) */ bool up; spinlock_t up_lock; /* Protects "up" */ struct sk_buff_head rx_queue; struct napi_struct napi; }; static LIST_HEAD(lapbeth_devices); static void lapbeth_connected(struct net_device *dev, int reason); static void lapbeth_disconnected(struct net_device *dev, int reason); /* ------------------------------------------------------------------------ */ /* Get the LAPB device for the ethernet device */ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev) { struct lapbethdev *lapbeth; list_for_each_entry_rcu(lapbeth, &lapbeth_devices, node, lockdep_rtnl_is_held()) { if (lapbeth->ethdev == dev) return lapbeth; } return NULL; } static __inline__ int dev_is_ethdev(struct net_device *dev) { return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5); } /* ------------------------------------------------------------------------ */ static int lapbeth_napi_poll(struct napi_struct *napi, int budget) { struct lapbethdev *lapbeth = container_of(napi, struct lapbethdev, napi); struct sk_buff *skb; int processed = 0; for (; processed < budget; ++processed) { skb = skb_dequeue(&lapbeth->rx_queue); if (!skb) break; netif_receive_skb_core(skb); } if (processed < budget) napi_complete(napi); return processed; } /* Receive a LAPB frame via an ethernet interface. */ static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len, err; struct lapbethdev *lapbeth; if (dev_net(dev) != &init_net) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (!pskb_may_pull(skb, 2)) goto drop; rcu_read_lock(); lapbeth = lapbeth_get_x25_dev(dev); if (!lapbeth) goto drop_unlock_rcu; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop_unlock; len = skb->data[0] + skb->data[1] * 256; dev->stats.rx_packets++; dev->stats.rx_bytes += len; skb_pull(skb, 2); /* Remove the length bytes */ skb_trim(skb, len); /* Set the length of the data */ err = lapb_data_received(lapbeth->axdev, skb); if (err != LAPB_OK) { printk(KERN_DEBUG "lapbether: lapb_data_received err - %d\n", err); goto drop_unlock; } out: spin_unlock_bh(&lapbeth->up_lock); rcu_read_unlock(); return 0; drop_unlock: kfree_skb(skb); goto out; drop_unlock_rcu: rcu_read_unlock(); drop: kfree_skb(skb); return 0; } static int lapbeth_data_indication(struct net_device *dev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; if (skb_cow(skb, 1)) { kfree_skb(skb); return NET_RX_DROP; } skb_push(skb, 1); ptr = skb->data; *ptr = X25_IFACE_DATA; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); return NET_RX_SUCCESS; } /* Send a LAPB frame via an ethernet interface */ static netdev_tx_t lapbeth_xmit(struct sk_buff *skb, struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop; /* There should be a pseudo header of 1 byte added by upper layers. * Check to make sure it is there before reading it. */ if (skb->len < 1) goto drop; switch (skb->data[0]) { case X25_IFACE_DATA: break; case X25_IFACE_CONNECT: err = lapb_connect_request(dev); if (err == LAPB_CONNECTED) lapbeth_connected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_connect_request error: %d\n", err); goto drop; case X25_IFACE_DISCONNECT: err = lapb_disconnect_request(dev); if (err == LAPB_NOTCONNECTED) lapbeth_disconnected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_disconnect_request err: %d\n", err); fallthrough; default: goto drop; } skb_pull(skb, 1); err = lapb_data_request(dev, skb); if (err != LAPB_OK) { pr_err("lapb_data_request error - %d\n", err); goto drop; } out: spin_unlock_bh(&lapbeth->up_lock); return NETDEV_TX_OK; drop: kfree_skb(skb); goto out; } static void lapbeth_data_transmit(struct net_device *ndev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(ndev); unsigned char *ptr; struct net_device *dev; int size = skb->len; ptr = skb_push(skb, 2); *ptr++ = size % 256; *ptr++ = size / 256; ndev->stats.tx_packets++; ndev->stats.tx_bytes += size; skb->dev = dev = lapbeth->ethdev; skb->protocol = htons(ETH_P_DEC); skb_reset_network_header(skb); dev_hard_header(skb, dev, ETH_P_DEC, bcast_addr, NULL, 0); dev_queue_xmit(skb); } static void lapbeth_connected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_CONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } static void lapbeth_disconnected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_DISCONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } /* Set AX.25 callsign */ static int lapbeth_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); return 0; } static const struct lapb_register_struct lapbeth_callbacks = { .connect_confirmation = lapbeth_connected, .connect_indication = lapbeth_connected, .disconnect_confirmation = lapbeth_disconnected, .disconnect_indication = lapbeth_disconnected, .data_indication = lapbeth_data_indication, .data_transmit = lapbeth_data_transmit, }; /* open/close a device */ static int lapbeth_open(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; napi_enable(&lapbeth->napi); err = lapb_register(dev, &lapbeth_callbacks); if (err != LAPB_OK) { napi_disable(&lapbeth->napi); pr_err("lapb_register error: %d\n", err); return -ENODEV; } spin_lock_bh(&lapbeth->up_lock); lapbeth->up = true; spin_unlock_bh(&lapbeth->up_lock); return 0; } static int lapbeth_close(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); lapbeth->up = false; spin_unlock_bh(&lapbeth->up_lock); err = lapb_unregister(dev); if (err != LAPB_OK) pr_err("lapb_unregister error: %d\n", err); napi_disable(&lapbeth->napi); return 0; } /* ------------------------------------------------------------------------ */ static const struct net_device_ops lapbeth_netdev_ops = { .ndo_open = lapbeth_open, .ndo_stop = lapbeth_close, .ndo_start_xmit = lapbeth_xmit, .ndo_set_mac_address = lapbeth_set_mac_address, }; static void lapbeth_setup(struct net_device *dev) { dev->netdev_ops = &lapbeth_netdev_ops; dev->needs_free_netdev = true; dev->type = ARPHRD_X25; dev->hard_header_len = 0; dev->mtu = 1000; dev->addr_len = 0; } /* Setup a new device. */ static int lapbeth_new_device(struct net_device *dev) { struct net_device *ndev; struct lapbethdev *lapbeth; int rc = -ENOMEM; ASSERT_RTNL(); if (dev->type != ARPHRD_ETHER) return -EINVAL; ndev = alloc_netdev(sizeof(*lapbeth), "lapb%d", NET_NAME_UNKNOWN, lapbeth_setup); if (!ndev) goto out; /* When transmitting data: * first this driver removes a pseudo header of 1 byte, * then the lapb module prepends an LAPB header of at most 3 bytes, * then this driver prepends a length field of 2 bytes, * then the underlying Ethernet device prepends its own header. */ ndev->needed_headroom = -1 + 3 + 2 + dev->hard_header_len + dev->needed_headroom; ndev->needed_tailroom = dev->needed_tailroom; lapbeth = netdev_priv(ndev); lapbeth->axdev = ndev; dev_hold(dev); lapbeth->ethdev = dev; lapbeth->up = false; spin_lock_init(&lapbeth->up_lock); skb_queue_head_init(&lapbeth->rx_queue); netif_napi_add(ndev, &lapbeth->napi, lapbeth_napi_poll, 16); rc = -EIO; if (register_netdevice(ndev)) goto fail; list_add_rcu(&lapbeth->node, &lapbeth_devices); rc = 0; out: return rc; fail: dev_put(dev); free_netdev(ndev); goto out; } /* Free a lapb network device. */ static void lapbeth_free_device(struct lapbethdev *lapbeth) { dev_put(lapbeth->ethdev); list_del_rcu(&lapbeth->node); unregister_netdevice(lapbeth->axdev); } /* Handle device status changes. * * Called from notifier with RTNL held. */ static int lapbeth_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct lapbethdev *lapbeth; struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev_net(dev) != &init_net) return NOTIFY_DONE; if (!dev_is_ethdev(dev) && !lapbeth_get_x25_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: /* New ethernet device -> new LAPB interface */ if (!lapbeth_get_x25_dev(dev)) lapbeth_new_device(dev); break; case NETDEV_GOING_DOWN: /* ethernet device closes -> close LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) dev_close(lapbeth->axdev); break; case NETDEV_UNREGISTER: /* ethernet device disappears -> remove LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) lapbeth_free_device(lapbeth); break; } return NOTIFY_DONE; } /* ------------------------------------------------------------------------ */ static struct packet_type lapbeth_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DEC), .func = lapbeth_rcv, }; static struct notifier_block lapbeth_dev_notifier = { .notifier_call = lapbeth_device_event, }; static const char banner[] __initconst = KERN_INFO "LAPB Ethernet driver version 0.02\n"; static int __init lapbeth_init_driver(void) { dev_add_pack(&lapbeth_packet_type); register_netdevice_notifier(&lapbeth_dev_notifier); printk(banner); return 0; } module_init(lapbeth_init_driver); static void __exit lapbeth_cleanup_driver(void) { struct lapbethdev *lapbeth; struct list_head *entry, *tmp; dev_remove_pack(&lapbeth_packet_type); unregister_netdevice_notifier(&lapbeth_dev_notifier); rtnl_lock(); list_for_each_safe(entry, tmp, &lapbeth_devices) { lapbeth = list_entry(entry, struct lapbethdev, node); dev_put(lapbeth->ethdev); unregister_netdevice(lapbeth->axdev); } rtnl_unlock(); } module_exit(lapbeth_cleanup_driver); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The unofficial LAPB over Ethernet driver"); MODULE_LICENSE("GPL"); |
707 707 707 706 707 704 326 326 324 326 706 706 706 706 703 705 489 489 705 706 706 706 488 686 707 706 705 706 707 706 706 706 706 704 706 686 705 705 704 707 706 704 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 | // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ #include <linux/kernel.h> #ifdef __KERNEL__ #include <linux/string.h> #else #include <string.h> #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ #include <asm/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> #include <asm/emulate_prefix.h> /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ __typeof__(t) v; \ switch (sizeof(t)) { \ case 4: v = le32_to_cpu(r); break; \ case 2: v = le16_to_cpu(r); break; \ case 1: v = r; break; \ default: \ BUILD_BUG(); break; \ } \ v; \ }) /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) #define peek_nbyte_next(t, insn, n) \ ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); }) #define peek_next(t, insn) peek_nbyte_next(t, insn, 0) /** * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { /* * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid * even if the input buffer is long enough to hold them. */ if (buf_len > MAX_INSN_SIZE) buf_len = MAX_INSN_SIZE; memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; insn->x86_64 = x86_64 ? 1 : 0; insn->opnd_bytes = 4; if (x86_64) insn->addr_bytes = 8; else insn->addr_bytes = 4; } static const insn_byte_t xen_prefix[] = { __XEN_EMULATE_PREFIX }; static const insn_byte_t kvm_prefix[] = { __KVM_EMULATE_PREFIX }; static int __insn_get_emulate_prefix(struct insn *insn, const insn_byte_t *prefix, size_t len) { size_t i; for (i = 0; i < len; i++) { if (peek_nbyte_next(insn_byte_t, insn, i) != prefix[i]) goto err_out; } insn->emulate_prefix_size = len; insn->next_byte += len; return 1; err_out: return 0; } static void insn_get_emulate_prefix(struct insn *insn) { if (__insn_get_emulate_prefix(insn, xen_prefix, sizeof(xen_prefix))) return; __insn_get_emulate_prefix(insn, kvm_prefix, sizeof(kvm_prefix)); } /** * insn_get_prefixes - scan x86 instruction prefix bytes * @insn: &struct insn containing instruction * * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. * * * Returns: * 0: on success * < 0: on error */ int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; insn_byte_t b, lb; int i, nb; if (prefixes->got) return 0; insn_get_emulate_prefix(insn); nb = 0; lb = 0; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); while (inat_is_legacy_prefix(attr)) { /* Skip if same prefix */ for (i = 0; i < nb; i++) if (prefixes->bytes[i] == b) goto found; if (nb == 4) /* Invalid instruction */ break; prefixes->bytes[nb++] = b; if (inat_is_address_size_prefix(attr)) { /* address size switches 2/4 or 4/8 */ if (insn->x86_64) insn->addr_bytes ^= 12; else insn->addr_bytes ^= 6; } else if (inat_is_operand_size_prefix(attr)) { /* oprand size switches 2/4 */ insn->opnd_bytes ^= 6; } found: prefixes->nbytes++; insn->next_byte++; lb = b; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); } /* Set the last prefix */ if (lb && lb != insn->prefixes.bytes[3]) { if (unlikely(insn->prefixes.bytes[3])) { /* Swap the last prefix */ b = insn->prefixes.bytes[3]; for (i = 0; i < nb; i++) if (prefixes->bytes[i] == lb) insn_set_byte(prefixes, i, b); } insn_set_byte(&insn->prefixes, 3, lb); } /* Decode REX prefix */ if (insn->x86_64) { b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_rex_prefix(attr)) { insn_field_set(&insn->rex_prefix, b, 1); insn->next_byte++; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; } } insn->rex_prefix.got = 1; /* Decode VEX prefix */ b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_vex_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is * LDS or LES or BOUND. */ if (X86_MODRM_MOD(b2) != 3) goto vex_end; } insn_set_byte(&insn->vex_prefix, 0, b); insn_set_byte(&insn->vex_prefix, 1, b2); if (inat_is_evex_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); b2 = peek_nbyte_next(insn_byte_t, insn, 3); insn_set_byte(&insn->vex_prefix, 3, b2); insn->vex_prefix.nbytes = 4; insn->next_byte += 4; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else if (inat_is_vex3_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* * For VEX2, fake VEX3-like byte#2. * Makes it easier to decode vex.W, vex.vvvv, * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. */ insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f); insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } } vex_end: insn->vex_prefix.got = 1; prefixes->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_opcode - collect opcode(s) * @insn: &struct insn containing instruction * * Populates @insn->opcode, updates @insn->next_byte to point past the * opcode byte(s), and set @insn->attr (except for groups). * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; int pfx_id, ret; insn_byte_t op; if (opcode->got) return 0; if (!insn->prefixes.got) { ret = insn_get_prefixes(insn); if (ret) return ret; } /* Get first opcode */ op = get_next(insn_byte_t, insn); insn_set_byte(opcode, 0, op); opcode->nbytes = 1; /* Check if there is VEX prefix or not */ if (insn_is_avx(insn)) { insn_byte_t m, p; m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } /* VEX has only 1 byte for opcode */ goto end; } insn->attr = inat_get_opcode_attribute(op); while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ op = get_next(insn_byte_t, insn); opcode->bytes[opcode->nbytes++] = op; pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } if (inat_must_vex(insn->attr)) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } end: opcode->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_modrm - collect ModRM byte, if any * @insn: &struct insn containing instruction * * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; int ret; if (modrm->got) return 0; if (!insn->opcode.got) { ret = insn_get_opcode(insn); if (ret) return ret; } if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); insn_field_set(modrm, mod, 1); if (inat_is_group(insn->attr)) { pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; modrm->got = 1; return 0; err_out: return -ENODATA; } /** * insn_rip_relative() - Does instruction use RIP-relative addressing mode? * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. No effect if @insn->x86_64 is 0. */ int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; int ret; if (!insn->x86_64) return 0; if (!modrm->got) { ret = insn_get_modrm(insn); if (ret) return 0; } /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. */ return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5); } /** * insn_get_sib() - Get the SIB byte of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_sib(struct insn *insn) { insn_byte_t modrm; int ret; if (insn->sib.got) return 0; if (!insn->modrm.got) { ret = insn_get_modrm(insn); if (ret) return ret; } if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { insn_field_set(&insn->sib, get_next(insn_byte_t, insn), 1); } } insn->sib.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_displacement() - Get the displacement of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. * * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; int ret; if (insn->displacement.got) return 0; if (!insn->sib.got) { ret = insn_get_sib(insn); if (ret) return ret; } if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: * mod = 00 - no displacement fields (exceptions below) * mod = 01 - 1-byte displacement field * mod = 10 - displacement field is 4 bytes, or 2 bytes if * address size = 2 (0x67 prefix in 32-bit mode) * mod = 11 - no memory operand * * If address size = 2... * mod = 00, r/m = 110 - displacement field is 2 bytes * * If address size != 2... * mod != 11, r/m = 100 - SIB byte exists * mod = 00, SIB base = 101 - displacement field is 4 bytes * mod = 00, r/m = 101 - rip-relative addressing, displacement * field is 4 bytes */ mod = X86_MODRM_MOD(insn->modrm.value); rm = X86_MODRM_RM(insn->modrm.value); base = X86_SIB_BASE(insn->sib.value); if (mod == 3) goto out; if (mod == 1) { insn_field_set(&insn->displacement, get_next(signed char, insn), 1); } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { insn_field_set(&insn->displacement, get_next(short, insn), 2); } } else { if ((mod == 0 && rm == 5) || mod == 2 || (mod == 0 && base == 5)) { insn_field_set(&insn->displacement, get_next(int, insn), 4); } } } out: insn->displacement.got = 1; return 0; err_out: return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ static int __get_moffset(struct insn *insn) { switch (insn->addr_bytes) { case 2: insn_field_set(&insn->moffset1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->moffset1, get_next(int, insn), 4); break; case 8: insn_field_set(&insn->moffset1, get_next(int, insn), 4); insn_field_set(&insn->moffset2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->moffset1.got = insn->moffset2.got = 1; return 1; err_out: return 0; } /* Decode imm v32(Iz). Return 0 if failed */ static int __get_immv32(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case 4: case 8: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } return 1; err_out: return 0; } /* Decode imm v64(Iv/Ov), Return 0 if failed */ static int __get_immv(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn->immediate1.nbytes = 4; break; case 8: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /* Decode ptr16:16/32(Ap) */ static int __get_immptr(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); break; case 8: /* ptr16:64 is not exist (no segment) */ return 0; default: /* opnd_bytes must be modified manually */ goto err_out; } insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2); insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /** * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded. Unsigned-value can be * computed by bit masking with ((1 << (nbytes * 8)) - 1) * * Returns: * 0: on success * < 0: on error */ int insn_get_immediate(struct insn *insn) { int ret; if (insn->immediate.got) return 0; if (!insn->displacement.got) { ret = insn_get_displacement(insn); if (ret) return ret; } if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) goto err_out; goto done; } if (!inat_has_immediate(insn->attr)) /* no immediates */ goto done; switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: insn_field_set(&insn->immediate, get_next(signed char, insn), 1); break; case INAT_IMM_WORD: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case INAT_IMM_DWORD: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; case INAT_IMM_QWORD: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; case INAT_IMM_PTR: if (!__get_immptr(insn)) goto err_out; break; case INAT_IMM_VWORD32: if (!__get_immv32(insn)) goto err_out; break; case INAT_IMM_VWORD: if (!__get_immv(insn)) goto err_out; break; default: /* Here, insn must have an immediate, but failed */ goto err_out; } if (inat_has_second_immediate(insn->attr)) { insn_field_set(&insn->immediate2, get_next(signed char, insn), 1); } done: insn->immediate.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_length() - Get the length of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * immediates bytes. * * Returns: * - 0 on success * - < 0 on error */ int insn_get_length(struct insn *insn) { int ret; if (insn->length) return 0; if (!insn->immediate.got) { ret = insn_get_immediate(insn); if (ret) return ret; } insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); return 0; } /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { return insn->opcode.got && insn->modrm.got && insn->sib.got && insn->displacement.got && insn->immediate.got; } /** * insn_decode() - Decode an x86 instruction * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @m: insn mode, see enum insn_mode * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) { int ret; /* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */ if (m == INSN_MODE_KERN) insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); else insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); ret = insn_get_length(insn); if (ret) return ret; if (insn_complete(insn)) return 0; return -EINVAL; } |
6 6 39 38 39 39 39 38 39 47 47 47 45 8 8 47 47 47 39 39 38 39 47 47 45 8 47 47 47 46 47 47 47 46 47 47 47 47 45 8 8 45 44 45 45 45 8 8 45 45 44 45 45 44 45 45 44 45 45 45 37 37 10 10 19 13 15 15 15 15 19 19 19 19 18 19 15 13 13 13 19 26 33 33 35 34 34 35 19 19 19 19 19 19 19 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2021 Intel Corporation * * utilities for mac80211 */ #include <net/mac80211.h> #include <linux/netdevice.h> #include <linux/export.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/bitmap.h> #include <linux/crc32.h> #include <net/net_namespace.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" #include "led.h" #include "wep.h" /* privid for wiphys to determine whether they belong to us or not */ const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid; struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) { struct ieee80211_local *local; local = wiphy_priv(wiphy); return &local->hw; } EXPORT_SYMBOL(wiphy_to_ieee80211_hw); u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type) { __le16 fc = hdr->frame_control; if (ieee80211_is_data(fc)) { if (len < 24) /* drop incorrect hdr len (data) */ return NULL; if (ieee80211_has_a4(fc)) return NULL; if (ieee80211_has_tods(fc)) return hdr->addr1; if (ieee80211_has_fromds(fc)) return hdr->addr2; return hdr->addr3; } if (ieee80211_is_s1g_beacon(fc)) { struct ieee80211_ext *ext = (void *) hdr; return ext->u.s1g_beacon.sa; } if (ieee80211_is_mgmt(fc)) { if (len < 24) /* drop incorrect hdr len (mgmt) */ return NULL; return hdr->addr3; } if (ieee80211_is_ctl(fc)) { if (ieee80211_is_pspoll(fc)) return hdr->addr1; if (ieee80211_is_back_req(fc)) { switch (type) { case NL80211_IFTYPE_STATION: return hdr->addr2; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: return hdr->addr1; default: break; /* fall through to the return */ } } } return NULL; } EXPORT_SYMBOL(ieee80211_get_bssid); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_hdr *hdr; skb_queue_walk(&tx->skbs, skb) { hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); } } int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble, int shift) { int dur; /* calculate duration (in microseconds, rounded up to next higher * integer if it includes a fractional microsecond) to send frame of * len bytes (does not include FCS) at the given rate. Duration will * also include SIFS. * * rate is in 100 kbps, so divident is multiplied by 10 in the * DIV_ROUND_UP() operations. * * shift may be 2 for 5 MHz channels or 1 for 10 MHz channels, and * is assumed to be 0 otherwise. */ if (band == NL80211_BAND_5GHZ || erp) { /* * OFDM: * * N_DBPS = DATARATE x 4 * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS) * (16 = SIGNAL time, 6 = tail bits) * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext * * T_SYM = 4 usec * 802.11a - 18.5.2: aSIFSTime = 16 usec * 802.11g - 19.8.4: aSIFSTime = 10 usec + * signal ext = 6 usec */ dur = 16; /* SIFS + signal ext */ dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */ dur += 4; /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */ /* IEEE 802.11-2012 18.3.2.4: all values above are: * * times 4 for 5 MHz * * times 2 for 10 MHz */ dur *= 1 << shift; /* rates should already consider the channel bandwidth, * don't apply divisor again. */ dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10, 4 * rate); /* T_SYM x N_SYM */ } else { /* * 802.11b or 802.11g with 802.11b compatibility: * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime + * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0. * * 802.11 (DS): 15.3.3, 802.11b: 18.3.4 * aSIFSTime = 10 usec * aPreambleLength = 144 usec or 72 usec with short preamble * aPLCPHeaderLength = 48 usec or 24 usec with short preamble */ dur = 10; /* aSIFSTime = 10 usec */ dur += short_preamble ? (72 + 24) : (144 + 48); dur += DIV_ROUND_UP(8 * (len + 4) * 10, rate); } return dur; } /* Exported duration function for driver use */ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate) { struct ieee80211_sub_if_data *sdata; u16 dur; int erp, shift = 0; bool short_preamble = false; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) erp = rate->flags & IEEE80211_RATE_ERP_G; shift = ieee80211_vif_get_shift(vif); } dur = ieee80211_frame_duration(band, frame_len, rate->bitrate, erp, short_preamble, shift); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_generic_frame_duration); __le16 ieee80211_rts_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, shift = 0, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) erp = rate->flags & IEEE80211_RATE_ERP_G; shift = ieee80211_vif_get_shift(vif); } bitrate = DIV_ROUND_UP(rate->bitrate, 1 << shift); /* CTS duration */ dur = ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble, shift); /* Data frame duration */ dur += ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble, shift); /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble, shift); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_rts_duration); __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, shift = 0, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) erp = rate->flags & IEEE80211_RATE_ERP_G; shift = ieee80211_vif_get_shift(vif); } bitrate = DIV_ROUND_UP(rate->bitrate, 1 << shift); /* Data frame duration */ dur = ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble, shift); if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) { /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble, shift); } return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_ctstoself_duration); static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) { struct ieee80211_local *local = sdata->local; struct ieee80211_vif *vif = &sdata->vif; struct fq *fq = &local->fq; struct ps_data *ps = NULL; struct txq_info *txqi; struct sta_info *sta; int i; local_bh_disable(); spin_lock(&fq->lock); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; sdata->vif.txqs_stopped[ac] = false; list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct ieee80211_txq *txq = sta->sta.txq[i]; if (!txq) continue; txqi = to_txq_info(txq); if (ac != txq->ac) continue; if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags)) continue; spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); spin_lock(&fq->lock); } } if (!vif->txq) goto out; txqi = to_txq_info(vif->txq); if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags) || (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) goto out; spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); local_bh_enable(); return; out: spin_unlock(&fq->lock); local_bh_enable(); } static void __releases(&local->queue_stop_reason_lock) __acquires(&local->queue_stop_reason_lock) _ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags) { struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; int i; rcu_read_lock(); if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; for (i = 0; i < local->hw.queues; i++) { if (local->queue_stop_reasons[i]) continue; spin_unlock_irqrestore(&local->queue_stop_reason_lock, *flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int ac; for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; if (ac_queue == i || sdata->vif.cab_queue == i) __ieee80211_wake_txqs(sdata, ac); } } spin_lock_irqsave(&local->queue_stop_reason_lock, *flags); } rcu_read_unlock(); } void ieee80211_wake_txqs(struct tasklet_struct *t) { struct ieee80211_local *local = from_tasklet(local, t, wake_txqs_tasklet); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); _ieee80211_wake_txqs(local, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) { struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; if (local->ops->wake_tx_queue) return; if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; list_for_each_entry_rcu(sdata, &local->interfaces, list) { int ac; if (!sdata->dev) continue; if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE && local->queue_stop_reasons[sdata->vif.cab_queue] != 0) continue; for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; if (ac_queue == queue || (sdata->vif.cab_queue == queue && local->queue_stop_reasons[ac_queue] == 0 && skb_queue_empty(&local->pending[ac_queue]))) netif_wake_subqueue(sdata->dev, ac); } } } static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted, unsigned long *flags) { struct ieee80211_local *local = hw_to_local(hw); trace_wake_queue(local, queue, reason); if (WARN_ON(queue >= hw->queues)) return; if (!test_bit(reason, &local->queue_stop_reasons[queue])) return; if (!refcounted) { local->q_stop_reasons[queue][reason] = 0; } else { local->q_stop_reasons[queue][reason]--; if (WARN_ON(local->q_stop_reasons[queue][reason] < 0)) local->q_stop_reasons[queue][reason] = 0; } if (local->q_stop_reasons[queue][reason] == 0) __clear_bit(reason, &local->queue_stop_reasons[queue]); if (local->queue_stop_reasons[queue] != 0) /* someone still has this queue stopped */ return; if (skb_queue_empty(&local->pending[queue])) { rcu_read_lock(); ieee80211_propagate_queue_wake(local, queue); rcu_read_unlock(); } else tasklet_schedule(&local->tx_pending_tasklet); /* * Calling _ieee80211_wake_txqs here can be a problem because it may * release queue_stop_reason_lock which has been taken by * __ieee80211_wake_queue's caller. It is certainly not very nice to * release someone's lock, but it is fine because all the callers of * __ieee80211_wake_queue call it right before releasing the lock. */ if (local->ops->wake_tx_queue) { if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) tasklet_schedule(&local->wake_txqs_tasklet); else _ieee80211_wake_txqs(local, flags); } } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_wake_queue(hw, queue, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue) { ieee80211_wake_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queue); static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; trace_stop_queue(local, queue, reason); if (WARN_ON(queue >= hw->queues)) return; if (!refcounted) local->q_stop_reasons[queue][reason] = 1; else local->q_stop_reasons[queue][reason]++; if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue])) return; if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int ac; if (!sdata->dev) continue; for (ac = 0; ac < n_acs; ac++) { if (sdata->vif.hw_queue[ac] == queue || sdata->vif.cab_queue == queue) { if (!local->ops->wake_tx_queue) { netif_stop_subqueue(sdata->dev, ac); continue; } spin_lock(&local->fq.lock); sdata->vif.txqs_stopped[ac] = true; spin_unlock(&local->fq.lock); } } } rcu_read_unlock(); } void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue) { ieee80211_stop_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_hw *hw = &local->hw; unsigned long flags; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int queue = info->hw_queue; if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); return; } spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_add_pending_skbs(struct ieee80211_local *local, struct sk_buff_head *skbs) { struct ieee80211_hw *hw = &local->hw; struct sk_buff *skb; unsigned long flags; int queue, i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); while ((skb = skb_dequeue(skbs))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); continue; } queue = info->hw_queue; __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); } for (i = 0; i < hw->queues; i++) __ieee80211_wake_queue(hw, i, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_stop_queue(hw, i, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues(struct ieee80211_hw *hw) { ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queues); int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int ret; if (WARN_ON(queue >= hw->queues)) return true; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); ret = test_bit(IEEE80211_QUEUE_STOP_REASON_DRIVER, &local->queue_stop_reasons[queue]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return ret; } EXPORT_SYMBOL(ieee80211_queue_stopped); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_wake_queue(hw, i, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queues(struct ieee80211_hw *hw) { ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queues); static unsigned int ieee80211_get_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { unsigned int queues; if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { int ac; queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) queues |= BIT(sdata->vif.hw_queue[ac]); if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.cab_queue); } else { /* all queues */ queues = BIT(local->hw.queues) - 1; } return queues; } void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop) { if (!local->ops->flush) return; /* * If no queue was set, or if the HW doesn't support * IEEE80211_HW_QUEUE_CONTROL - flush all queues */ if (!queues || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) queues = ieee80211_get_vif_queues(local, sdata); ieee80211_stop_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); drv_flush(local, sdata, queues, drop); ieee80211_wake_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); } void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool drop) { __ieee80211_flush_queues(local, sdata, 0, drop); } void ieee80211_stop_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_stop_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, true); } void ieee80211_wake_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum queue_stop_reason reason) { ieee80211_wake_queues_by_reason(&local->hw, ieee80211_get_vif_queues(local, sdata), reason, true); } static void __iterate_interfaces(struct ieee80211_local *local, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_sub_if_data *sdata; bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; list_for_each_entry_rcu(sdata, &local->interfaces, list) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) continue; break; case NL80211_IFTYPE_AP_VLAN: continue; default: break; } if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if (ieee80211_sdata_running(sdata) || !active_only) iterator(data, sdata->vif.addr, &sdata->vif); } sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)); if (sdata && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); } void ieee80211_iterate_interfaces( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); mutex_lock(&local->iflist_mtx); __iterate_interfaces(local, iter_flags, iterator, data); mutex_unlock(&local->iflist_mtx); } EXPORT_SYMBOL_GPL(ieee80211_iterate_interfaces); void ieee80211_iterate_active_interfaces_atomic( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); void ieee80211_iterate_active_interfaces_mtx( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(hw->wiphy); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct sta_info *sta; list_for_each_entry_rcu(sta, &local->sta_list, list) { if (!sta->uploaded) continue; iterator(data, &sta->sta); } } void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_stations(local, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic); struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!ieee80211_sdata_running(sdata) || !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) return NULL; return &sdata->vif; } EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { if (!vif) return NULL; return &vif_to_sdata(vif)->wdev; } EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev); /* * Nothing should have been stuffed into the workqueue during * the suspend->resume cycle. Since we can't check each caller * of this function if we are already quiescing / suspended, * check here and don't WARN since this can actually happen when * the rx path (for example) is racing against __ieee80211_suspend * and suspending / quiescing was set after the rx path checked * them. */ static bool ieee80211_can_queue_work(struct ieee80211_local *local) { if (local->quiescing || (local->suspended && !local->resuming)) { pr_warn("queueing ieee80211 work while going to suspend\n"); return false; } return true; } void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_work(local->workqueue, work); } EXPORT_SYMBOL(ieee80211_queue_work); void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, struct delayed_work *dwork, unsigned long delay) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_delayed_work(local->workqueue, dwork, delay); } EXPORT_SYMBOL(ieee80211_queue_delayed_work); static void ieee80211_parse_extension_element(u32 *crc, const struct element *elem, struct ieee802_11_elems *elems) { const void *data = elem->data + 1; u8 len; if (!elem->datalen) return; len = elem->datalen - 1; switch (elem->data[0]) { case WLAN_EID_EXT_HE_MU_EDCA: if (len >= sizeof(*elems->mu_edca_param_set)) { elems->mu_edca_param_set = data; if (crc) *crc = crc32_be(*crc, (void *)elem, elem->datalen + 2); } break; case WLAN_EID_EXT_HE_CAPABILITY: elems->he_cap = data; elems->he_cap_len = len; break; case WLAN_EID_EXT_HE_OPERATION: if (len >= sizeof(*elems->he_operation) && len >= ieee80211_he_oper_size(data) - 1) { if (crc) *crc = crc32_be(*crc, (void *)elem, elem->datalen + 2); elems->he_operation = data; } break; case WLAN_EID_EXT_UORA: if (len >= 1) elems->uora_element = data; break; case WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME: if (len == 3) elems->max_channel_switch_time = data; break; case WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION: if (len >= sizeof(*elems->mbssid_config_ie)) elems->mbssid_config_ie = data; break; case WLAN_EID_EXT_HE_SPR: if (len >= sizeof(*elems->he_spr) && len >= ieee80211_he_spr_size(data)) elems->he_spr = data; break; case WLAN_EID_EXT_HE_6GHZ_CAPA: if (len >= sizeof(*elems->he_6ghz_capa)) elems->he_6ghz_capa = data; break; } } static u32 _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, struct ieee802_11_elems *elems, u64 filter, u32 crc, const struct element *check_inherit) { const struct element *elem; bool calc_crc = filter != 0; DECLARE_BITMAP(seen_elems, 256); const u8 *ie; bitmap_zero(seen_elems, 256); for_each_element(elem, start, len) { bool elem_parse_failed; u8 id = elem->id; u8 elen = elem->datalen; const u8 *pos = elem->data; if (check_inherit && !cfg80211_is_element_inherited(elem, check_inherit)) continue; switch (id) { case WLAN_EID_SSID: case WLAN_EID_SUPP_RATES: case WLAN_EID_FH_PARAMS: case WLAN_EID_DS_PARAMS: case WLAN_EID_CF_PARAMS: case WLAN_EID_TIM: case WLAN_EID_IBSS_PARAMS: case WLAN_EID_CHALLENGE: case WLAN_EID_RSN: case WLAN_EID_ERP_INFO: case WLAN_EID_EXT_SUPP_RATES: case WLAN_EID_HT_CAPABILITY: case WLAN_EID_HT_OPERATION: case WLAN_EID_VHT_CAPABILITY: case WLAN_EID_VHT_OPERATION: case WLAN_EID_MESH_ID: case WLAN_EID_MESH_CONFIG: case WLAN_EID_PEER_MGMT: case WLAN_EID_PREQ: case WLAN_EID_PREP: case WLAN_EID_PERR: case WLAN_EID_RANN: case WLAN_EID_CHANNEL_SWITCH: case WLAN_EID_EXT_CHANSWITCH_ANN: case WLAN_EID_COUNTRY: case WLAN_EID_PWR_CONSTRAINT: case WLAN_EID_TIMEOUT_INTERVAL: case WLAN_EID_SECONDARY_CHANNEL_OFFSET: case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: case WLAN_EID_CHAN_SWITCH_PARAM: case WLAN_EID_EXT_CAPABILITY: case WLAN_EID_CHAN_SWITCH_TIMING: case WLAN_EID_LINK_ID: case WLAN_EID_BSS_MAX_IDLE_PERIOD: case WLAN_EID_RSNX: case WLAN_EID_S1G_BCN_COMPAT: case WLAN_EID_S1G_CAPABILITIES: case WLAN_EID_S1G_OPERATION: case WLAN_EID_AID_RESPONSE: case WLAN_EID_S1G_SHORT_BCN_INTERVAL: /* * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible * that if the content gets bigger it might be needed more than once */ if (test_bit(id, seen_elems)) { elems->parse_error = true; continue; } break; } if (calc_crc && id < 64 && (filter & (1ULL << id))) crc = crc32_be(crc, pos - 2, elen + 2); elem_parse_failed = false; switch (id) { case WLAN_EID_LINK_ID: if (elen + 2 < sizeof(struct ieee80211_tdls_lnkie)) { elem_parse_failed = true; break; } elems->lnk_id = (void *)(pos - 2); break; case WLAN_EID_CHAN_SWITCH_TIMING: if (elen < sizeof(struct ieee80211_ch_switch_timing)) { elem_parse_failed = true; break; } elems->ch_sw_timing = (void *)pos; break; case WLAN_EID_EXT_CAPABILITY: elems->ext_capab = pos; elems->ext_capab_len = elen; break; case WLAN_EID_SSID: elems->ssid = pos; elems->ssid_len = elen; break; case WLAN_EID_SUPP_RATES: elems->supp_rates = pos; elems->supp_rates_len = elen; break; case WLAN_EID_DS_PARAMS: if (elen >= 1) elems->ds_params = pos; else elem_parse_failed = true; break; case WLAN_EID_TIM: if (elen >= sizeof(struct ieee80211_tim_ie)) { elems->tim = (void *)pos; elems->tim_len = elen; } else elem_parse_failed = true; break; case WLAN_EID_VENDOR_SPECIFIC: if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && pos[2] == 0xf2) { /* Microsoft OUI (00:50:F2) */ if (calc_crc) crc = crc32_be(crc, pos - 2, elen + 2); if (elen >= 5 && pos[3] == 2) { /* OUI Type 2 - WMM IE */ if (pos[4] == 0) { elems->wmm_info = pos; elems->wmm_info_len = elen; } else if (pos[4] == 1) { elems->wmm_param = pos; elems->wmm_param_len = elen; } } } break; case WLAN_EID_RSN: elems->rsn = pos; elems->rsn_len = elen; break; case WLAN_EID_ERP_INFO: if (elen >= 1) elems->erp_info = pos; else elem_parse_failed = true; break; case WLAN_EID_EXT_SUPP_RATES: elems->ext_supp_rates = pos; elems->ext_supp_rates_len = elen; break; case WLAN_EID_HT_CAPABILITY: if (elen >= sizeof(struct ieee80211_ht_cap)) elems->ht_cap_elem = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_HT_OPERATION: if (elen >= sizeof(struct ieee80211_ht_operation)) elems->ht_operation = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_VHT_CAPABILITY: if (elen >= sizeof(struct ieee80211_vht_cap)) elems->vht_cap_elem = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_VHT_OPERATION: if (elen >= sizeof(struct ieee80211_vht_operation)) { elems->vht_operation = (void *)pos; if (calc_crc) crc = crc32_be(crc, pos - 2, elen + 2); break; } elem_parse_failed = true; break; case WLAN_EID_OPMODE_NOTIF: if (elen > 0) { elems->opmode_notif = pos; if (calc_crc) crc = crc32_be(crc, pos - 2, elen + 2); break; } elem_parse_failed = true; break; case WLAN_EID_MESH_ID: elems->mesh_id = pos; elems->mesh_id_len = elen; break; case WLAN_EID_MESH_CONFIG: if (elen >= sizeof(struct ieee80211_meshconf_ie)) elems->mesh_config = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_PEER_MGMT: elems->peering = pos; elems->peering_len = elen; break; case WLAN_EID_MESH_AWAKE_WINDOW: if (elen >= 2) elems->awake_window = (void *)pos; break; case WLAN_EID_PREQ: elems->preq = pos; elems->preq_len = elen; break; case WLAN_EID_PREP: elems->prep = pos; elems->prep_len = elen; break; case WLAN_EID_PERR: elems->perr = pos; elems->perr_len = elen; break; case WLAN_EID_RANN: if (elen >= sizeof(struct ieee80211_rann_ie)) elems->rann = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_CHANNEL_SWITCH: if (elen != sizeof(struct ieee80211_channel_sw_ie)) { elem_parse_failed = true; break; } elems->ch_switch_ie = (void *)pos; break; case WLAN_EID_EXT_CHANSWITCH_ANN: if (elen != sizeof(struct ieee80211_ext_chansw_ie)) { elem_parse_failed = true; break; } elems->ext_chansw_ie = (void *)pos; break; case WLAN_EID_SECONDARY_CHANNEL_OFFSET: if (elen != sizeof(struct ieee80211_sec_chan_offs_ie)) { elem_parse_failed = true; break; } elems->sec_chan_offs = (void *)pos; break; case WLAN_EID_CHAN_SWITCH_PARAM: if (elen < sizeof(*elems->mesh_chansw_params_ie)) { elem_parse_failed = true; break; } elems->mesh_chansw_params_ie = (void *)pos; break; case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: if (!action || elen < sizeof(*elems->wide_bw_chansw_ie)) { elem_parse_failed = true; break; } elems->wide_bw_chansw_ie = (void *)pos; break; case WLAN_EID_CHANNEL_SWITCH_WRAPPER: if (action) { elem_parse_failed = true; break; } /* * This is a bit tricky, but as we only care about * the wide bandwidth channel switch element, so * just parse it out manually. */ ie = cfg80211_find_ie(WLAN_EID_WIDE_BW_CHANNEL_SWITCH, pos, elen); if (ie) { if (ie[1] >= sizeof(*elems->wide_bw_chansw_ie)) elems->wide_bw_chansw_ie = (void *)(ie + 2); else elem_parse_failed = true; } break; case WLAN_EID_COUNTRY: elems->country_elem = pos; elems->country_elem_len = elen; break; case WLAN_EID_PWR_CONSTRAINT: if (elen != 1) { elem_parse_failed = true; break; } elems->pwr_constr_elem = pos; break; case WLAN_EID_CISCO_VENDOR_SPECIFIC: /* Lots of different options exist, but we only care * about the Dynamic Transmit Power Control element. * First check for the Cisco OUI, then for the DTPC * tag (0x00). */ if (elen < 4) { elem_parse_failed = true; break; } if (pos[0] != 0x00 || pos[1] != 0x40 || pos[2] != 0x96 || pos[3] != 0x00) break; if (elen != 6) { elem_parse_failed = true; break; } if (calc_crc) crc = crc32_be(crc, pos - 2, elen + 2); elems->cisco_dtpc_elem = pos; break; case WLAN_EID_ADDBA_EXT: if (elen < sizeof(struct ieee80211_addba_ext_ie)) { elem_parse_failed = true; break; } elems->addba_ext_ie = (void *)pos; break; case WLAN_EID_TIMEOUT_INTERVAL: if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) elems->timeout_int = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_BSS_MAX_IDLE_PERIOD: if (elen >= sizeof(*elems->max_idle_period_ie)) elems->max_idle_period_ie = (void *)pos; break; case WLAN_EID_RSNX: elems->rsnx = pos; elems->rsnx_len = elen; break; case WLAN_EID_TX_POWER_ENVELOPE: if (elen < 1 || elen > sizeof(struct ieee80211_tx_pwr_env)) break; if (elems->tx_pwr_env_num >= ARRAY_SIZE(elems->tx_pwr_env)) break; elems->tx_pwr_env[elems->tx_pwr_env_num] = (void *)pos; elems->tx_pwr_env_len[elems->tx_pwr_env_num] = elen; elems->tx_pwr_env_num++; break; case WLAN_EID_EXTENSION: ieee80211_parse_extension_element(calc_crc ? &crc : NULL, elem, elems); break; case WLAN_EID_S1G_CAPABILITIES: if (elen >= sizeof(*elems->s1g_capab)) elems->s1g_capab = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_S1G_OPERATION: if (elen == sizeof(*elems->s1g_oper)) elems->s1g_oper = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_S1G_BCN_COMPAT: if (elen == sizeof(*elems->s1g_bcn_compat)) elems->s1g_bcn_compat = (void *)pos; else elem_parse_failed = true; break; case WLAN_EID_AID_RESPONSE: if (elen == sizeof(struct ieee80211_aid_response_ie)) elems->aid_resp = (void *)pos; else elem_parse_failed = true; break; default: break; } if (elem_parse_failed) elems->parse_error = true; else __set_bit(id, seen_elems); } if (!for_each_element_completed(elem, start, len)) elems->parse_error = true; return crc; } static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, struct ieee802_11_elems *elems, const u8 *transmitter_bssid, const u8 *bss_bssid, u8 *nontransmitted_profile) { const struct element *elem, *sub; size_t profile_len = 0; bool found = false; if (!bss_bssid || !transmitter_bssid) return profile_len; for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) { if (elem->datalen < 2) continue; if (elem->data[0] < 1 || elem->data[0] > 8) continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 new_bssid[ETH_ALEN]; const u8 *index; if (sub->id != 0 || sub->datalen < 4) { /* not a valid BSS profile */ continue; } if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || sub->data[1] != 2) { /* The first element of the * Nontransmitted BSSID Profile is not * the Nontransmitted BSSID Capability * element. */ continue; } memset(nontransmitted_profile, 0, len); profile_len = cfg80211_merge_profile(start, len, elem, sub, nontransmitted_profile, len); /* found a Nontransmitted BSSID Profile */ index = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, nontransmitted_profile, profile_len); if (!index || index[1] < 1 || index[2] == 0) { /* Invalid MBSSID Index element */ continue; } cfg80211_gen_new_bssid(transmitter_bssid, elem->data[0], index[2], new_bssid); if (ether_addr_equal(new_bssid, bss_bssid)) { found = true; elems->bssid_index_len = index[1]; elems->bssid_index = (void *)&index[2]; break; } } } return found ? profile_len : 0; } struct ieee802_11_elems *ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, u64 filter, u32 crc, const u8 *transmitter_bssid, const u8 *bss_bssid) { struct ieee802_11_elems *elems; const struct element *non_inherit = NULL; u8 *nontransmitted_profile; int nontransmitted_profile_len = 0; elems = kzalloc(sizeof(*elems) + len, GFP_ATOMIC); if (!elems) return NULL; elems->ie_start = start; elems->total_len = len; elems->scratch_len = len; elems->scratch_pos = elems->scratch; nontransmitted_profile = elems->scratch_pos; nontransmitted_profile_len = ieee802_11_find_bssid_profile(start, len, elems, transmitter_bssid, bss_bssid, nontransmitted_profile); non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, nontransmitted_profile, nontransmitted_profile_len); crc = _ieee802_11_parse_elems_crc(start, len, action, elems, filter, crc, non_inherit); /* Override with nontransmitted profile, if found */ if (nontransmitted_profile_len) _ieee802_11_parse_elems_crc(nontransmitted_profile, nontransmitted_profile_len, action, elems, 0, 0, NULL); if (elems->tim && !elems->parse_error) { const struct ieee80211_tim_ie *tim_ie = elems->tim; elems->dtim_period = tim_ie->dtim_period; elems->dtim_count = tim_ie->dtim_count; } /* Override DTIM period and count if needed */ if (elems->bssid_index && elems->bssid_index_len >= offsetofend(struct ieee80211_bssid_index, dtim_period)) elems->dtim_period = elems->bssid_index->dtim_period; if (elems->bssid_index && elems->bssid_index_len >= offsetofend(struct ieee80211_bssid_index, dtim_count)) elems->dtim_count = elems->bssid_index->dtim_count; elems->crc = crc; return elems; } void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac) { struct ieee80211_chanctx_conf *chanctx_conf; const struct ieee80211_reg_rule *rrule; const struct ieee80211_wmm_ac *wmm_ac; u16 center_freq = 0; if (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_STATION) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (chanctx_conf) center_freq = chanctx_conf->def.chan->center_freq; if (!center_freq) { rcu_read_unlock(); return; } rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq)); if (IS_ERR_OR_NULL(rrule) || !rrule->has_wmm) { rcu_read_unlock(); return; } if (sdata->vif.type == NL80211_IFTYPE_AP) wmm_ac = &rrule->wmm_rule.ap[ac]; else wmm_ac = &rrule->wmm_rule.client[ac]; qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); qparam->txop = min_t(u16, qparam->txop, wmm_ac->cot / 32); rcu_read_unlock(); } void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_queue_params qparam; struct ieee80211_chanctx_conf *chanctx_conf; int ac; bool use_11b; bool is_ocb; /* Use another EDCA parameters if dot11OCBActivated=true */ int aCWmin, aCWmax; if (!local->ops->conf_tx) return; if (local->hw.queues < IEEE80211_NUM_ACS) return; memset(&qparam, 0, sizeof(qparam)); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); use_11b = (chanctx_conf && chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) && !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE); rcu_read_unlock(); is_ocb = (sdata->vif.type == NL80211_IFTYPE_OCB); /* Set defaults according to 802.11-2007 Table 7-37 */ aCWmax = 1023; if (use_11b) aCWmin = 31; else aCWmin = 15; /* Confiure old 802.11b/g medium access rules. */ qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; qparam.aifs = 2; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { /* Update if QoS is enabled. */ if (enable_qos) { switch (ac) { case IEEE80211_AC_BK: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 9; else qparam.aifs = 7; break; /* never happens but let's not leave undefined */ default: case IEEE80211_AC_BE: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 6; else qparam.aifs = 3; break; case IEEE80211_AC_VI: qparam.cw_max = aCWmin; qparam.cw_min = (aCWmin + 1) / 2 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 6016/32; else qparam.txop = 3008/32; if (is_ocb) qparam.aifs = 3; else qparam.aifs = 2; break; case IEEE80211_AC_VO: qparam.cw_max = (aCWmin + 1) / 2 - 1; qparam.cw_min = (aCWmin + 1) / 4 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 3264/32; else qparam.txop = 1504/32; qparam.aifs = 2; break; } } ieee80211_regulatory_limit_wmm_params(sdata, &qparam, ac); qparam.uapsd = false; sdata->tx_conf[ac] = qparam; drv_conf_tx(local, sdata, ac, &qparam); } if (sdata->vif.type != NL80211_IFTYPE_MONITOR && sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) { sdata->vif.bss_conf.qos = enable_qos; if (bss_notify) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS); } } void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *da, const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; int err; /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN); mgmt = skb_put_zero(skb, 24 + 6); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_AUTH); memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg); mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); mgmt->u.auth.status_code = cpu_to_le16(status); if (extra) skb_put_data(skb, extra, extra_len); if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx); if (WARN_ON(err)) { kfree_skb(skb); return; } } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | tx_flags; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt = (void *)frame_buf; /* build frame */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); mgmt->duration = 0; /* initialize only */ mgmt->seq_ctrl = 0; /* initialize only */ memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); if (send_frame) { skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_DEAUTH_FRAME_LEN); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); /* copy in frame */ skb_put_data(skb, mgmt, IEEE80211_DEAUTH_FRAME_LEN); if (sdata->vif.type != NL80211_IFTYPE_STATION || !(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } } static u8 *ieee80211_write_he_6ghz_cap(u8 *pos, __le16 cap, u8 *end) { if ((end - pos) < 5) return pos; *pos++ = WLAN_EID_EXTENSION; *pos++ = 1 + sizeof(cap); *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; memcpy(pos, &cap, sizeof(cap)); return pos + 2; } static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, const u8 *ie, size_t ie_len, enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, size_t *offset, u32 flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; size_t noffset; int supp_rates_len, i; u8 rates[32]; int num_rates; int ext_rates_len; int shift; u32 rate_flags; bool have_80mhz = false; *offset = 0; sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband)) return 0; rate_flags = ieee80211_chandef_rate_flags(chandef); shift = ieee80211_chandef_get_shift(chandef); num_rates = 0; for (i = 0; i < sband->n_bitrates; i++) { if ((BIT(i) & rate_mask) == 0) continue; /* skip rate */ if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; rates[num_rates++] = (u8) DIV_ROUND_UP(sband->bitrates[i].bitrate, (1 << shift) * 5); } supp_rates_len = min_t(int, num_rates, 8); if (end - pos < 2 + supp_rates_len) goto out_err; *pos++ = WLAN_EID_SUPP_RATES; *pos++ = supp_rates_len; memcpy(pos, rates, supp_rates_len); pos += supp_rates_len; /* insert "request information" if in custom IEs */ if (ie && ie_len) { static const u8 before_extrates[] = { WLAN_EID_SSID, WLAN_EID_SUPP_RATES, WLAN_EID_REQUEST, }; noffset = ieee80211_ie_split(ie, ie_len, before_extrates, ARRAY_SIZE(before_extrates), *offset); if (end - pos < noffset - *offset) goto out_err; memcpy(pos, ie + *offset, noffset - *offset); pos += noffset - *offset; *offset = noffset; } ext_rates_len = num_rates - supp_rates_len; if (ext_rates_len > 0) { if (end - pos < 2 + ext_rates_len) goto out_err; *pos++ = WLAN_EID_EXT_SUPP_RATES; *pos++ = ext_rates_len; memcpy(pos, rates + supp_rates_len, ext_rates_len); pos += ext_rates_len; } if (chandef->chan && sband->band == NL80211_BAND_2GHZ) { if (end - pos < 3) goto out_err; *pos++ = WLAN_EID_DS_PARAMS; *pos++ = 1; *pos++ = ieee80211_frequency_to_channel( chandef->chan->center_freq); } if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) goto done; /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_DS_PARAMS, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; noffset = ieee80211_ie_split(ie, ie_len, before_ht, ARRAY_SIZE(before_ht), *offset); if (end - pos < noffset - *offset) goto out_err; memcpy(pos, ie + *offset, noffset - *offset); pos += noffset - *offset; *offset = noffset; } if (sband->ht_cap.ht_supported) { if (end - pos < 2 + sizeof(struct ieee80211_ht_cap)) goto out_err; pos = ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, sband->ht_cap.cap); } /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, WLAN_EID_MESH_ID, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), *offset); if (end - pos < noffset - *offset) goto out_err; memcpy(pos, ie + *offset, noffset - *offset); pos += noffset - *offset; *offset = noffset; } /* Check if any channel in this sband supports at least 80 MHz */ for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (sband->vht_cap.vht_supported && have_80mhz) { if (end - pos < 2 + sizeof(struct ieee80211_vht_cap)) goto out_err; pos = ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); } /* insert custom IEs that go before HE */ if (ie && ie_len) { static const u8 before_he[] = { /* * no need to list the ones split off before VHT * or generated here */ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, WLAN_EID_AP_CSN, /* TODO: add 11ah/11aj/11ak elements */ }; noffset = ieee80211_ie_split(ie, ie_len, before_he, ARRAY_SIZE(before_he), *offset); if (end - pos < noffset - *offset) goto out_err; memcpy(pos, ie + *offset, noffset - *offset); pos += noffset - *offset; *offset = noffset; } he_cap = ieee80211_get_he_iftype_cap(sband, ieee80211_vif_type_p2p(&sdata->vif)); if (he_cap && cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE)) { pos = ieee80211_ie_build_he_cap(0, pos, he_cap, end); if (!pos) goto out_err; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(NL80211_BAND_6GHZ), IEEE80211_CHAN_NO_HE)) { struct ieee80211_supported_band *sband6; sband6 = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; he_cap = ieee80211_get_he_iftype_cap(sband6, ieee80211_vif_type_p2p(&sdata->vif)); if (he_cap) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); pos = ieee80211_write_he_6ghz_cap(pos, cap, end); } } /* * If adding more here, adjust code in main.c * that calculates local->scan_ies_len. */ return pos - buffer; out_err: WARN_ONCE(1, "not enough space for preq IEs\n"); done: return pos - buffer; } int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { size_t pos = 0, old_pos = 0, custom_ie_offset = 0; int i; memset(ie_desc, 0, sizeof(*ie_desc)); for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { pos += ieee80211_build_preq_ies_band(sdata, buffer + pos, buffer_len - pos, ie, ie_len, i, rate_masks[i], chandef, &custom_ie_offset, flags); ie_desc->ies[i] = buffer + old_pos; ie_desc->len[i] = pos - old_pos; old_pos = pos; } } /* add any remaining custom IEs */ if (ie && ie_len) { if (WARN_ONCE(buffer_len - pos < ie_len - custom_ie_offset, "not enough space for preq custom IEs\n")) return pos; memcpy(buffer + pos, ie + custom_ie_offset, ie_len - custom_ie_offset); ie_desc->common_ies = buffer + pos; ie_desc->common_ie_len = ie_len - custom_ie_offset; pos += ie_len - custom_ie_offset; } return pos; }; struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags) { struct ieee80211_local *local = sdata->local; struct cfg80211_chan_def chandef; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; int ies_len; u32 rate_masks[NUM_NL80211_BANDS] = {}; struct ieee80211_scan_ies dummy_ie_desc; /* * Do not send DS Channel parameter for directed probe requests * in order to maximize the chance that we get a response. Some * badly-behaved APs don't respond when this parameter is included. */ chandef.width = sdata->vif.bss_conf.chandef.width; if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else chandef.chan = chan; skb = ieee80211_probereq_get(&local->hw, src, ssid, ssid_len, 100 + ie_len); if (!skb) return NULL; rate_masks[chan->band] = ratemask; ies_len = ieee80211_build_preq_ies(sdata, skb_tail_pointer(skb), skb_tailroom(skb), &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); skb_put(skb, ies_len); if (dst) { mgmt = (struct ieee80211_mgmt *) skb->data; memcpy(mgmt->da, dst, ETH_ALEN); memcpy(mgmt->bssid, dst, ETH_ALEN); } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; return skb; } u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates) { struct ieee80211_supported_band *sband; size_t num_rates; u32 supp_rates, rate_flags; int i, j, shift; sband = sdata->local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return 1; rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); shift = ieee80211_vif_get_shift(&sdata->vif); num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + elems->ext_supp_rates_len; i++) { u8 rate = 0; int own_rate; bool is_basic; if (i < elems->supp_rates_len) rate = elems->supp_rates[i]; else if (elems->ext_supp_rates) rate = elems->ext_supp_rates [i - elems->supp_rates_len]; own_rate = 5 * (rate & 0x7f); is_basic = !!(rate & 0x80); if (is_basic && (rate & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY) continue; for (j = 0; j < num_rates; j++) { int brate; if ((rate_flags & sband->bitrates[j].flags) != rate_flags) continue; brate = DIV_ROUND_UP(sband->bitrates[j].bitrate, 1 << shift); if (brate == own_rate) { supp_rates |= BIT(j); if (basic_rates && is_basic) *basic_rates |= BIT(j); } } } return supp_rates; } void ieee80211_stop_device(struct ieee80211_local *local) { local_bh_disable(); ieee80211_handle_queued_frames(local); local_bh_enable(); ieee80211_led_radio(local, false); ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); cancel_work_sync(&local->reconfig_filter); flush_workqueue(local->workqueue); drv_stop(local); } static void ieee80211_flush_completed_scan(struct ieee80211_local *local, bool aborted) { /* It's possible that we don't handle the scan completion in * time during suspend, so if it's still marked as completed * here, queue the work and flush it to clean things up. * Instead of calling the worker function directly here, we * really queue it to avoid potential races with other flows * scheduling the same work. */ if (test_bit(SCAN_COMPLETED, &local->scanning)) { /* If coming from reconfiguration failure, abort the scan so * we don't attempt to continue a partial HW scan - which is * possible otherwise if (e.g.) the 2.4 GHz portion was the * completed scan, and a 5 GHz portion is still pending. */ if (aborted) set_bit(SCAN_ABORTED, &local->scanning); ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); flush_delayed_work(&local->scan_work); } } static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; /* * We get here if during resume the device can't be restarted properly. * We might also get here if this happens during HW reset, which is a * slightly different situation and we need to drop all connections in * the latter case. * * Ask cfg80211 to turn off all interfaces, this will result in more * warnings but at least we'll then get into a clean stopped state. */ local->resuming = false; local->suspended = false; local->in_reconfig = false; ieee80211_flush_completed_scan(local, true); /* scheduled scan clearly can't be running any more, but tell * cfg80211 and clear local state */ ieee80211_sched_scan_end(local); list_for_each_entry(sdata, &local->interfaces, list) sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; /* Mark channel contexts as not being in the driver any more to avoid * removing them from the driver during the shutdown process... */ mutex_lock(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) ctx->driver_present = false; mutex_unlock(&local->chanctx_mtx); } static void ieee80211_assign_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; if (!local->use_chanctx) return; mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (conf) { ctx = container_of(conf, struct ieee80211_chanctx, conf); drv_assign_vif_chanctx(local, sdata, ctx); } mutex_unlock(&local->chanctx_mtx); } static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; /* add STAs back */ mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { enum ieee80211_sta_state state; if (!sta->uploaded || sta->sdata != sdata) continue; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) WARN_ON(drv_sta_state(local, sta->sdata, sta, state, state + 1)); } mutex_unlock(&local->sta_mtx); } static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) { struct cfg80211_nan_func *func, **funcs; int res, id, i = 0; res = drv_start_nan(sdata->local, sdata, &sdata->u.nan.conf); if (WARN_ON(res)) return res; funcs = kcalloc(sdata->local->hw.max_nan_de_entries + 1, sizeof(*funcs), GFP_KERNEL); if (!funcs) return -ENOMEM; /* Add all the functions: * This is a little bit ugly. We need to call a potentially sleeping * callback for each NAN function, so we can't hold the spinlock. */ spin_lock_bh(&sdata->u.nan.func_lock); idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) funcs[i++] = func; spin_unlock_bh(&sdata->u.nan.func_lock); for (i = 0; funcs[i]; i++) { res = drv_add_nan_func(sdata->local, sdata, funcs[i]); if (WARN_ON(res)) ieee80211_nan_func_terminated(&sdata->vif, funcs[i]->instance_id, NL80211_NAN_FUNC_TERM_REASON_ERROR, GFP_KERNEL); } kfree(funcs); return 0; } int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; struct sta_info *sta; int res, i; bool reconfig_due_to_wowlan = false; struct ieee80211_sub_if_data *sched_scan_sdata; struct cfg80211_sched_scan_request *sched_scan_req; bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; /* nothing to do if HW shouldn't run */ if (!local->open_count) goto wake_up; #ifdef CONFIG_PM if (suspended) local->resuming = true; if (local->wowlan) { /* * In the wowlan case, both mac80211 and the device * are functional when the resume op is called, so * clear local->suspended so the device could operate * normally (e.g. pass rx frames). */ local->suspended = false; res = drv_resume(local); local->wowlan = false; if (res < 0) { local->resuming = false; return res; } if (res == 0) goto wake_up; WARN_ON(res > 1); /* * res is 1, which means the driver requested * to go through a regular reset on wakeup. * restore local->suspended in this case. */ reconfig_due_to_wowlan = true; local->suspended = true; } #endif /* * In case of hw_restart during suspend (without wowlan), * cancel restart work, as we are reconfiguring the device * anyway. * Note that restart_work is scheduled on a frozen workqueue, * so we can't deadlock in this case. */ if (suspended && local->in_reconfig && !reconfig_due_to_wowlan) cancel_work_sync(&local->restart_work); local->started = false; /* * Upon resume hardware can sometimes be goofy due to * various platform / driver / bus issues, so restarting * the device may at times not work immediately. Propagate * the error. */ res = drv_start(local); if (res) { if (suspended) WARN(1, "Hardware became unavailable upon resume. This could be a software issue prior to suspend or a hardware issue.\n"); else WARN(1, "Hardware became unavailable during restart.\n"); ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); ieee80211_handle_reconfig_failure(local); return res; } /* setup fragmentation threshold */ drv_set_frag_threshold(local, hw->wiphy->frag_threshold); /* setup RTS threshold */ drv_set_rts_threshold(local, hw->wiphy->rts_threshold); /* reset coverage class */ drv_set_coverage_class(local, hw->wiphy->coverage_class); ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); /* add interfaces */ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata) { /* in HW restart it exists already */ WARN_ON(local->resuming); res = drv_add_interface(local, sdata); if (WARN_ON(res)) { RCU_INIT_POINTER(local->monitor_sdata, NULL); synchronize_net(); kfree(sdata); } } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && sdata->vif.type != NL80211_IFTYPE_MONITOR && ieee80211_sdata_running(sdata)) { res = drv_add_interface(local, sdata); if (WARN_ON(res)) break; } } /* If adding any of the interfaces failed above, roll back and * report failure. */ if (res) { list_for_each_entry_continue_reverse(sdata, &local->interfaces, list) if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && sdata->vif.type != NL80211_IFTYPE_MONITOR && ieee80211_sdata_running(sdata)) drv_remove_interface(local, sdata); ieee80211_handle_reconfig_failure(local); return res; } /* add channel contexts */ if (local->use_chanctx) { mutex_lock(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) WARN_ON(drv_add_chanctx(local, ctx)); mutex_unlock(&local->chanctx_mtx); sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_sdata_running(sdata)) ieee80211_assign_chanctx(local, sdata); } /* reconfigure hardware */ ieee80211_hw_config(local, ~0); ieee80211_configure_filter(local); /* Finally also reconfigure all the BSS information */ list_for_each_entry(sdata, &local->interfaces, list) { u32 changed; if (!ieee80211_sdata_running(sdata)) continue; ieee80211_assign_chanctx(local, sdata); switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: break; case NL80211_IFTYPE_ADHOC: if (sdata->vif.bss_conf.ibss_joined) WARN_ON(drv_join_ibss(local, sdata)); fallthrough; default: ieee80211_reconfig_stations(sdata); fallthrough; case NL80211_IFTYPE_AP: /* AP stations are handled later */ for (i = 0; i < IEEE80211_NUM_ACS; i++) drv_conf_tx(local, sdata, i, &sdata->tx_conf[i]); break; } /* common change flags for all interface types */ changed = BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE | BSS_CHANGED_ERP_SLOT | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BEACON_INT | BSS_CHANGED_BSSID | BSS_CHANGED_CQM | BSS_CHANGED_QOS | BSS_CHANGED_IDLE | BSS_CHANGED_TXPOWER | BSS_CHANGED_MCAST_RATE; if (sdata->vif.mu_mimo_owner) changed |= BSS_CHANGED_MU_GROUPS; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: changed |= BSS_CHANGED_ASSOC | BSS_CHANGED_ARP_FILTER | BSS_CHANGED_PS; /* Re-send beacon info report to the driver */ if (sdata->u.mgd.have_beacon) changed |= BSS_CHANGED_BEACON_INFO; if (sdata->vif.bss_conf.max_idle_period || sdata->vif.bss_conf.protected_keep_alive) changed |= BSS_CHANGED_KEEP_ALIVE; sdata_lock(sdata); ieee80211_bss_info_change_notify(sdata, changed); sdata_unlock(sdata); break; case NL80211_IFTYPE_OCB: changed |= BSS_CHANGED_OCB; ieee80211_bss_info_change_notify(sdata, changed); break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; fallthrough; case NL80211_IFTYPE_AP: changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS; if (sdata->vif.bss_conf.ftm_responder == 1 && wiphy_ext_feature_isset(sdata->local->hw.wiphy, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER)) changed |= BSS_CHANGED_FTM_RESPONDER; if (sdata->vif.type == NL80211_IFTYPE_AP) { changed |= BSS_CHANGED_AP_PROBE_RESP; if (rcu_access_pointer(sdata->u.ap.beacon)) drv_start_ap(local, sdata); } fallthrough; case NL80211_IFTYPE_MESH_POINT: if (sdata->vif.bss_conf.enable_beacon) { changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_bss_info_change_notify(sdata, changed); } break; case NL80211_IFTYPE_NAN: res = ieee80211_reconfig_nan(sdata); if (res < 0) { ieee80211_handle_reconfig_failure(local); return res; } break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_WDS: WARN_ON(1); break; } } ieee80211_recalc_ps(local); /* * The sta might be in psm against the ap (e.g. because * this was the state before a hw restart), so we * explicitly send a null packet in order to make sure * it'll sync against the ap (and get out of psm). */ if (!(local->hw.conf.flags & IEEE80211_CONF_PS)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; if (!sdata->u.mgd.associated) continue; ieee80211_send_nullfunc(local, sdata, false); } } /* APs are now beaconing, add back stations */ mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { enum ieee80211_sta_state state; if (!sta->uploaded) continue; if (sta->sdata->vif.type != NL80211_IFTYPE_AP && sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN) continue; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) WARN_ON(drv_sta_state(local, sta->sdata, sta, state, state + 1)); } mutex_unlock(&local->sta_mtx); /* add back keys */ list_for_each_entry(sdata, &local->interfaces, list) ieee80211_reenable_keys(sdata); /* Reconfigure sched scan if it was interrupted by FW restart */ mutex_lock(&local->mtx); sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->mtx)); sched_scan_req = rcu_dereference_protected(local->sched_scan_req, lockdep_is_held(&local->mtx)); if (sched_scan_sdata && sched_scan_req) /* * Sched scan stopped, but we don't want to report it. Instead, * we're trying to reschedule. However, if more than one scan * plan was set, we cannot reschedule since we don't know which * scan plan was currently running (and some scan plans may have * already finished). */ if (sched_scan_req->n_scan_plans > 1 || __ieee80211_request_sched_scan_start(sched_scan_sdata, sched_scan_req)) { RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_stopped = true; } mutex_unlock(&local->mtx); if (sched_scan_stopped) cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); wake_up: if (local->monitors == local->open_count && local->monitors > 0) ieee80211_add_virtual_monitor(local); /* * Clear the WLAN_STA_BLOCK_BA flag so new aggregation * sessions can be established after a resume. * * Also tear down aggregation sessions since reconfiguring * them in a hardware restart scenario is not easily done * right now, and the hardware will have lost information * about the sessions, but we and the AP still think they * are active. This is really a workaround though. */ if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { if (!local->resuming) ieee80211_sta_tear_down_BA_sessions( sta, AGG_STOP_LOCAL_REQUEST); clear_sta_flag(sta, WLAN_STA_BLOCK_BA); } mutex_unlock(&local->sta_mtx); } /* * If this is for hw restart things are still running. * We may want to change that later, however. */ if (local->open_count && (!suspended || reconfig_due_to_wowlan)) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART); if (local->in_reconfig) { in_reconfig = local->in_reconfig; local->in_reconfig = false; barrier(); /* Restart deferred ROCs */ mutex_lock(&local->mtx); ieee80211_start_next_roc(local); mutex_unlock(&local->mtx); /* Requeue all works */ list_for_each_entry(sdata, &local->interfaces, list) ieee80211_queue_work(&local->hw, &sdata->work); } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); if (in_reconfig) { list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } } if (!suspended) return 0; #ifdef CONFIG_PM /* first set suspended false, then resuming */ local->suspended = false; mb(); local->resuming = false; ieee80211_flush_completed_scan(local, false); if (local->open_count && !reconfig_due_to_wowlan) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } mod_timer(&local->sta_cleanup, jiffies + 1); #else WARN_ON(1); #endif return 0; } static void ieee80211_reconfig_disconnect(struct ieee80211_vif *vif, u8 flag) { struct ieee80211_sub_if_data *sdata; struct ieee80211_local *local; struct ieee80211_key *key; if (WARN_ON(!vif)) return; sdata = vif_to_sdata(vif); local = sdata->local; if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_RESUME && !local->resuming)) return; if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_HW_RESTART && !local->in_reconfig)) return; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return; sdata->flags |= flag; mutex_lock(&local->key_mtx); list_for_each_entry(key, &sdata->key_list, list) key->flags |= KEY_FLAG_TAINTED; mutex_unlock(&local->key_mtx); } void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_HW_RESTART); } EXPORT_SYMBOL_GPL(ieee80211_hw_restart_disconnect); void ieee80211_resume_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_RESUME); } EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect); void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; mutex_lock(&local->chanctx_mtx); chanctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); /* * This function can be called from a work, thus it may be possible * that the chanctx_conf is removed (due to a disconnection, for * example). * So nothing should be done in such case. */ if (!chanctx_conf) goto unlock; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_smps_chanctx(local, chanctx); unlock: mutex_unlock(&local->chanctx_mtx); } void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; mutex_lock(&local->chanctx_mtx); chanctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (WARN_ON_ONCE(!chanctx_conf)) goto unlock; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_chanctx_min_def(local, chanctx); unlock: mutex_unlock(&local->chanctx_mtx); } size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) { size_t pos = offset; while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) pos += 2 + ies[pos + 1]; return pos; } static void _ieee80211_enable_rssi_reports(struct ieee80211_sub_if_data *sdata, int rssi_min_thold, int rssi_max_thold) { trace_api_enable_rssi_reports(sdata, rssi_min_thold, rssi_max_thold); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; /* * Scale up threshold values before storing it, as the RSSI averaging * algorithm uses a scaled up value as well. Change this scaling * factor if the RSSI averaging algorithm changes. */ sdata->u.mgd.rssi_min_thold = rssi_min_thold*16; sdata->u.mgd.rssi_max_thold = rssi_max_thold*16; } void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, int rssi_min_thold, int rssi_max_thold) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); WARN_ON(rssi_min_thold == rssi_max_thold || rssi_min_thold > rssi_max_thold); _ieee80211_enable_rssi_reports(sdata, rssi_min_thold, rssi_max_thold); } EXPORT_SYMBOL(ieee80211_enable_rssi_reports); void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); _ieee80211_enable_rssi_reports(sdata, 0, 0); } EXPORT_SYMBOL(ieee80211_disable_rssi_reports); u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap) { __le16 tmp; *pos++ = WLAN_EID_HT_CAPABILITY; *pos++ = sizeof(struct ieee80211_ht_cap); memset(pos, 0, sizeof(struct ieee80211_ht_cap)); /* capability flags */ tmp = cpu_to_le16(cap); memcpy(pos, &tmp, sizeof(u16)); pos += sizeof(u16); /* AMPDU parameters */ *pos++ = ht_cap->ampdu_factor | (ht_cap->ampdu_density << IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); /* MCS set */ memcpy(pos, &ht_cap->mcs, sizeof(ht_cap->mcs)); pos += sizeof(ht_cap->mcs); /* extended capabilities */ pos += sizeof(__le16); /* BF capabilities */ pos += sizeof(__le32); /* antenna selection */ pos += sizeof(u8); return pos; } u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap) { __le32 tmp; *pos++ = WLAN_EID_VHT_CAPABILITY; *pos++ = sizeof(struct ieee80211_vht_cap); memset(pos, 0, sizeof(struct ieee80211_vht_cap)); /* capability flags */ tmp = cpu_to_le32(cap); memcpy(pos, &tmp, sizeof(u32)); pos += sizeof(u32); /* VHT MCS set */ memcpy(pos, &vht_cap->vht_mcs, sizeof(vht_cap->vht_mcs)); pos += sizeof(vht_cap->vht_mcs); return pos; } u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap(sband, iftype); if (!he_cap) return 0; n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); return 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); } u8 *ieee80211_ie_build_he_cap(u32 disable_flags, u8 *pos, const struct ieee80211_sta_he_cap *he_cap, u8 *end) { struct ieee80211_he_cap_elem elem; u8 n; u8 ie_len; u8 *orig_pos = pos; /* Make sure we have place for the IE */ /* * TODO: the 1 added is because this temporarily is under the EXTENSION * IE. Get rid of it when it moves. */ if (!he_cap) return orig_pos; /* modify on stack first to calculate 'n' and 'ie_len' correctly */ elem = he_cap->he_cap_elem; if (disable_flags & IEEE80211_STA_DISABLE_40MHZ) elem.phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G); if (disable_flags & IEEE80211_STA_DISABLE_160MHZ) elem.phy_cap_info[0] &= ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; if (disable_flags & IEEE80211_STA_DISABLE_80P80MHZ) elem.phy_cap_info[0] &= ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; n = ieee80211_he_mcs_nss_size(&elem); ie_len = 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); if ((end - pos) < ie_len) return orig_pos; *pos++ = WLAN_EID_EXTENSION; pos++; /* We'll set the size later below */ *pos++ = WLAN_EID_EXT_HE_CAPABILITY; /* Fixed data */ memcpy(pos, &elem, sizeof(elem)); pos += sizeof(elem); memcpy(pos, &he_cap->he_mcs_nss_supp, n); pos += n; /* Check if PPE Threshold should be present */ if ((he_cap->he_cap_elem.phy_cap_info[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) goto end; /* * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) */ n = hweight8(he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> IEEE80211_PPE_THRES_NSS_POS)); /* * Each pair is 6 bits, and we need to add the 7 "header" bits to the * total size. */ n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; n = DIV_ROUND_UP(n, 8); /* Copy PPE Thresholds */ memcpy(pos, &he_cap->ppe_thres, n); pos += n; end: orig_pos[1] = (pos - orig_pos) - 2; return pos; } void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_supported_band *sband; const struct ieee80211_sband_iftype_data *iftd; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); u8 *pos; u16 cap; if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy, BIT(NL80211_BAND_6GHZ), IEEE80211_CHAN_NO_HE)) return; sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (!iftd) return; /* Check for device HE 6 GHz capability before adding element */ if (!iftd->he_6ghz_capa.capa) return; cap = le16_to_cpu(iftd->he_6ghz_capa.capa); cap &= ~IEEE80211_HE_6GHZ_CAP_SM_PS; switch (sdata->smps_mode) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_STATIC: cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_DYNAMIC: cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; } pos = skb_put(skb, 2 + 1 + sizeof(cap)); ieee80211_write_he_6ghz_cap(pos, cpu_to_le16(cap), pos + 2 + 1 + sizeof(cap)); } u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) { struct ieee80211_ht_operation *ht_oper; /* Build HT Information */ *pos++ = WLAN_EID_HT_OPERATION; *pos++ = sizeof(struct ieee80211_ht_operation); ht_oper = (struct ieee80211_ht_operation *)pos; ht_oper->primary_chan = ieee80211_frequency_to_channel( chandef->chan->center_freq); switch (chandef->width) { case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_40: if (chandef->center_freq1 > chandef->chan->center_freq) ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_BELOW; break; default: ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE; break; } if (ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && chandef->width != NL80211_CHAN_WIDTH_20_NOHT && chandef->width != NL80211_CHAN_WIDTH_20) ht_oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY; if (rifs_mode) ht_oper->ht_param |= IEEE80211_HT_PARAM_RIFS_MODE; ht_oper->operation_mode = cpu_to_le16(prot_mode); ht_oper->stbc_param = 0x0000; /* It seems that Basic MCS set and Supported MCS set are identical for the first 10 bytes */ memset(&ht_oper->basic_set, 0, 16); memcpy(&ht_oper->basic_set, &ht_cap->mcs, 10); return pos + sizeof(struct ieee80211_ht_operation); } void ieee80211_ie_build_wide_bw_cs(u8 *pos, const struct cfg80211_chan_def *chandef) { *pos++ = WLAN_EID_WIDE_BW_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ /* New channel width */ switch (chandef->width) { case NL80211_CHAN_WIDTH_80: *pos++ = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_160: *pos++ = IEEE80211_VHT_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80P80: *pos++ = IEEE80211_VHT_CHANWIDTH_80P80MHZ; break; default: *pos++ = IEEE80211_VHT_CHANWIDTH_USE_HT; } /* new center frequency segment 0 */ *pos++ = ieee80211_frequency_to_channel(chandef->center_freq1); /* new center frequency segment 1 */ if (chandef->center_freq2) *pos++ = ieee80211_frequency_to_channel(chandef->center_freq2); else *pos++ = 0; } u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef) { struct ieee80211_vht_operation *vht_oper; *pos++ = WLAN_EID_VHT_OPERATION; *pos++ = sizeof(struct ieee80211_vht_operation); vht_oper = (struct ieee80211_vht_operation *)pos; vht_oper->center_freq_seg0_idx = ieee80211_frequency_to_channel( chandef->center_freq1); if (chandef->center_freq2) vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel(chandef->center_freq2); else vht_oper->center_freq_seg1_idx = 0x00; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: /* * Convert 160 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; vht_oper->center_freq_seg1_idx = vht_oper->center_freq_seg0_idx; if (chandef->chan->center_freq < chandef->center_freq1) vht_oper->center_freq_seg0_idx -= 8; else vht_oper->center_freq_seg0_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: /* * Convert 80+80 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; default: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; break; } /* don't require special VHT peer rates */ vht_oper->basic_mcs_set = cpu_to_le16(0xffff); return pos + sizeof(struct ieee80211_vht_operation); } u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; struct ieee80211_he_6ghz_oper *he_6ghz_op; u32 he_oper_params; u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); if (chandef->chan->band == NL80211_BAND_6GHZ) ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; he_oper_params |= u32_encode_bits(1023, /* disabled */ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); if (chandef->chan->band == NL80211_BAND_6GHZ) he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); pos += sizeof(struct ieee80211_he_operation); if (chandef->chan->band != NL80211_BAND_6GHZ) goto out; /* TODO add VHT operational */ he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; he_6ghz_op->minrate = 6; /* 6 Mbps */ he_6ghz_op->primary = ieee80211_frequency_to_channel(chandef->chan->center_freq); he_6ghz_op->ccfs0 = ieee80211_frequency_to_channel(chandef->center_freq1); if (chandef->center_freq2) he_6ghz_op->ccfs1 = ieee80211_frequency_to_channel(chandef->center_freq2); else he_6ghz_op->ccfs1 = 0; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: /* Convert 160 MHz channel width to new style as interop * workaround. */ he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) he_6ghz_op->ccfs0 -= 8; else he_6ghz_op->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } pos += sizeof(struct ieee80211_he_6ghz_oper); out: return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef) { enum nl80211_channel_type channel_type; if (!ht_oper) return false; switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_NONE: channel_type = NL80211_CHAN_HT20; break; case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: channel_type = NL80211_CHAN_HT40PLUS; break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: channel_type = NL80211_CHAN_HT40MINUS; break; default: channel_type = NL80211_CHAN_NO_HT; return false; } cfg80211_chandef_create(chandef, chandef->chan, channel_type); return true; } bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def new = *chandef; int cf0, cf1; int ccfs0, ccfs1, ccfs2; int ccf0, ccf1; u32 vht_cap; bool support_80_80 = false; bool support_160 = false; u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); u8 supp_chwidth = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap; support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK | IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)); support_80_80 = ((vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >> IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1)); ccfs0 = oper->center_freq_seg0_idx; ccfs1 = oper->center_freq_seg1_idx; ccfs2 = (le16_to_cpu(htop->operation_mode) & IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; ccf0 = ccfs0; /* if not supported, parse as though we didn't understand it */ if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) ext_nss_bw_supp = 0; /* * Cf. IEEE 802.11 Table 9-250 * * We really just consider that because it's inefficient to connect * at a higher bandwidth than we'll actually be able to use. */ switch ((supp_chwidth << 4) | ext_nss_bw_supp) { default: case 0x00: ccf1 = 0; support_160 = false; support_80_80 = false; break; case 0x01: support_80_80 = false; fallthrough; case 0x02: case 0x03: ccf1 = ccfs2; break; case 0x10: ccf1 = ccfs1; break; case 0x11: case 0x12: if (!ccfs1) ccf1 = ccfs2; else ccf1 = ccfs1; break; case 0x13: case 0x20: case 0x23: ccf1 = ccfs1; break; } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); switch (oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: /* just use HT information directly */ break; case IEEE80211_VHT_CHANWIDTH_80MHZ: new.width = NL80211_CHAN_WIDTH_80; new.center_freq1 = cf0; /* If needed, adjust based on the newer interop workaround. */ if (ccf1) { unsigned int diff; diff = abs(ccf1 - ccf0); if ((diff == 8) && support_160) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf1; } else if ((diff > 8) && support_80_80) { new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq2 = cf1; } } break; case IEEE80211_VHT_CHANWIDTH_160MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf0; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq1 = cf0; new.center_freq2 = cf1; break; default: return false; } if (!cfg80211_chandef_valid(&new)) return false; *chandef = new; return true; } bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); const struct ieee80211_sta_he_cap *he_cap; struct cfg80211_chan_def he_chandef = *chandef; const struct ieee80211_he_6ghz_oper *he_6ghz_oper; bool support_80_80, support_160; u8 he_phy_cap; u32 freq; if (chandef->chan->band != NL80211_BAND_6GHZ) return true; sband = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; he_cap = ieee80211_get_he_iftype_cap(sband, iftype); if (!he_cap) { sdata_info(sdata, "Missing iftype sband data/HE cap"); return false; } he_phy_cap = he_cap->he_cap_elem.phy_cap_info[0]; support_160 = he_phy_cap & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; support_80_80 = he_phy_cap & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; if (!he_oper) { sdata_info(sdata, "HE is not advertised on (on %d MHz), expect issues\n", chandef->chan->center_freq); return false; } he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) { sdata_info(sdata, "HE 6GHz operation missing (on %d MHz), expect issues\n", chandef->chan->center_freq); return false; } freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, NL80211_BAND_6GHZ); he_chandef.chan = ieee80211_get_channel(sdata->local->hw.wiphy, freq); switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: he_chandef.width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: he_chandef.width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; if (!he_6ghz_oper->ccfs1) break; if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) { if (support_160) he_chandef.width = NL80211_CHAN_WIDTH_160; } else { if (support_80_80) he_chandef.width = NL80211_CHAN_WIDTH_80P80; } break; } if (he_chandef.width == NL80211_CHAN_WIDTH_160) { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } else { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, NL80211_BAND_6GHZ); if (support_80_80 || support_160) he_chandef.center_freq2 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } if (!cfg80211_chandef_valid(&he_chandef)) { sdata_info(sdata, "HE 6GHz operation resulted in invalid chandef: %d MHz/%d/%d MHz/%d MHz\n", he_chandef.chan ? he_chandef.chan->center_freq : 0, he_chandef.width, he_chandef.center_freq1, he_chandef.center_freq2); return false; } *chandef = he_chandef; return true; } bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { u32 oper_freq; if (!oper) return false; switch (FIELD_GET(S1G_OPER_CH_WIDTH_OPER, oper->ch_width)) { case IEEE80211_S1G_CHANWIDTH_1MHZ: chandef->width = NL80211_CHAN_WIDTH_1; break; case IEEE80211_S1G_CHANWIDTH_2MHZ: chandef->width = NL80211_CHAN_WIDTH_2; break; case IEEE80211_S1G_CHANWIDTH_4MHZ: chandef->width = NL80211_CHAN_WIDTH_4; break; case IEEE80211_S1G_CHANWIDTH_8MHZ: chandef->width = NL80211_CHAN_WIDTH_8; break; case IEEE80211_S1G_CHANWIDTH_16MHZ: chandef->width = NL80211_CHAN_WIDTH_16; break; default: return false; } oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch, NL80211_BAND_S1GHZ); chandef->center_freq1 = KHZ_TO_MHZ(oper_freq); chandef->freq1_offset = oper_freq % 1000; return true; } int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) { u32 rate_flags = ieee80211_chandef_rate_flags(chandef); int shift = ieee80211_chandef_get_shift(chandef); struct ieee80211_rate *br; int brate, rate, i, j, count = 0; *rates = 0; for (i = 0; i < srates_len; i++) { rate = srates[i] & 0x7f; for (j = 0; j < sband->n_bitrates; j++) { br = &sband->bitrates[j]; if ((rate_flags & br->flags) != rate_flags) continue; brate = DIV_ROUND_UP(br->bitrate, (1 << shift) * 5); if (brate == rate) { *rates |= BIT(j); count++; break; } } } return count; } int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; int rate, shift; u8 i, rates, *pos; u32 basic_rates = sdata->vif.bss_conf.basic_rates; u32 rate_flags; shift = ieee80211_vif_get_shift(&sdata->vif); rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); sband = local->hw.wiphy->bands[band]; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; rates++; } if (rates > 8) rates = 8; if (skb_tailroom(skb) < rates + 2) return -ENOMEM; pos = skb_put(skb, rates + 2); *pos++ = WLAN_EID_SUPP_RATES; *pos++ = rates; for (i = 0; i < rates; i++) { u8 basic = 0; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (need_basic && basic_rates & BIT(i)) basic = 0x80; rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5 * (1 << shift)); *pos++ = basic | (u8) rate; } return 0; } int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; int rate, shift; u8 i, exrates, *pos; u32 basic_rates = sdata->vif.bss_conf.basic_rates; u32 rate_flags; rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); shift = ieee80211_vif_get_shift(&sdata->vif); sband = local->hw.wiphy->bands[band]; exrates = 0; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; exrates++; } if (exrates > 8) exrates -= 8; else exrates = 0; if (skb_tailroom(skb) < exrates + 2) return -ENOMEM; if (exrates) { pos = skb_put(skb, exrates + 2); *pos++ = WLAN_EID_EXT_SUPP_RATES; *pos++ = exrates; for (i = 8; i < sband->n_bitrates; i++) { u8 basic = 0; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (need_basic && basic_rates & BIT(i)) basic = 0x80; rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5 * (1 << shift)); *pos++ = basic | (u8) rate; } } return 0; } int ieee80211_ave_rssi(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) { /* non-managed type inferfaces */ return 0; } return -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs) { if (!mcs) return 1; /* TODO: consider rx_highest */ if (mcs->rx_mask[3]) return 4; if (mcs->rx_mask[2]) return 3; if (mcs->rx_mask[1]) return 2; return 1; } /** * ieee80211_calculate_rx_timestamp - calculate timestamp in frame * @local: mac80211 hw info struct * @status: RX status * @mpdu_len: total MPDU length (including FCS) * @mpdu_offset: offset into MPDU to calculate timestamp at * * This function calculates the RX timestamp at the given MPDU offset, taking * into account what the RX timestamp was. An offset of 0 will just normalize * the timestamp to TSF at beginning of MPDU reception. */ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset) { u64 ts = status->mactime; struct rate_info ri; u16 rate; u8 n_ltf; if (WARN_ON(!ieee80211_have_rx_timestamp(status))) return 0; memset(&ri, 0, sizeof(ri)); ri.bw = status->bw; /* Fill cfg80211 rate info */ switch (status->encoding) { case RX_ENC_HE: ri.flags |= RATE_INFO_FLAGS_HE_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.he_ru_alloc = status->he_ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11ax_D6.0, section 27.3.4 for * VHT PPDU format. */ if (status->flag & RX_FLAG_MACTIME_PLCP_START) { mpdu_offset += 2; ts += 36; /* * TODO: * For HE MU PPDU, add the HE-SIG-B. * For HE ER PPDU, add 8us for the HE-SIG-A. * For HE TB PPDU, add 4us for the HE-STF. * Add the HE-LTF durations - variable. */ } break; case RX_ENC_HT: ri.mcs = status->rate_idx; ri.flags |= RATE_INFO_FLAGS_MCS; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 19.3.2 for * HT PPDU format. */ if (status->flag & RX_FLAG_MACTIME_PLCP_START) { mpdu_offset += 2; if (status->enc_flags & RX_ENC_FLAG_HT_GF) ts += 24; else ts += 32; /* * Add Data HT-LTFs per streams * TODO: add Extension HT-LTFs, 4us per LTF */ n_ltf = ((ri.mcs >> 3) & 3) + 1; n_ltf = n_ltf == 3 ? 4 : n_ltf; ts += n_ltf * 4; } break; case RX_ENC_VHT: ri.flags |= RATE_INFO_FLAGS_VHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 21.3.2 for * VHT PPDU format. */ if (status->flag & RX_FLAG_MACTIME_PLCP_START) { mpdu_offset += 2; ts += 36; /* * Add VHT-LTFs per streams */ n_ltf = (ri.nss != 1) && (ri.nss % 2) ? ri.nss + 1 : ri.nss; ts += 4 * n_ltf; } break; default: WARN_ON(1); fallthrough; case RX_ENC_LEGACY: { struct ieee80211_supported_band *sband; int shift = 0; int bitrate; switch (status->bw) { case RATE_INFO_BW_10: shift = 1; break; case RATE_INFO_BW_5: shift = 2; break; } sband = local->hw.wiphy->bands[status->band]; bitrate = sband->bitrates[status->rate_idx].bitrate; ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift)); if (status->flag & RX_FLAG_MACTIME_PLCP_START) { if (status->band == NL80211_BAND_5GHZ) { ts += 20 << shift; mpdu_offset += 2; } else if (status->enc_flags & RX_ENC_FLAG_SHORTPRE) { ts += 96; } else { ts += 192; } } break; } } rate = cfg80211_calculate_bitrate(&ri); if (WARN_ONCE(!rate, "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n", (unsigned long long)status->flag, status->rate_idx, status->nss)) return 0; /* rewind from end of MPDU */ if (status->flag & RX_FLAG_MACTIME_END) ts -= mpdu_len * 8 * 10 / rate; ts += mpdu_offset * 8 * 10 / rate; return ts; } void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef; /* for interface list, to avoid linking iflist_mtx and chanctx_mtx */ lockdep_assert_wiphy(local->hw.wiphy); mutex_lock(&local->mtx); list_for_each_entry(sdata, &local->interfaces, list) { /* it might be waiting for the local->mtx, but then * by the time it gets it, sdata->wdev.cac_started * will no longer be true */ cancel_delayed_work(&sdata->dfs_cac_timer_work); if (sdata->wdev.cac_started) { chandef = sdata->vif.bss_conf.chandef; ieee80211_vif_release_channel(sdata); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL); } } mutex_unlock(&local->mtx); } void ieee80211_dfs_radar_detected_work(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); struct cfg80211_chan_def chandef = local->hw.conf.chandef; struct ieee80211_chanctx *ctx; int num_chanctx = 0; mutex_lock(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; num_chanctx++; chandef = ctx->conf.def; } mutex_unlock(&local->chanctx_mtx); wiphy_lock(local->hw.wiphy); ieee80211_dfs_cac_cancel(local); wiphy_unlock(local->hw.wiphy); if (num_chanctx > 1) /* XXX: multi-channel is not supported yet */ WARN_ON(1); else cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL); } void ieee80211_radar_detected(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_radar_detected(local); schedule_work(&local->radar_detected_work); } EXPORT_SYMBOL(ieee80211_radar_detected); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c) { u32 ret; int tmp; switch (c->width) { case NL80211_CHAN_WIDTH_20: c->width = NL80211_CHAN_WIDTH_20_NOHT; ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; break; case NL80211_CHAN_WIDTH_40: c->width = NL80211_CHAN_WIDTH_20; c->center_freq1 = c->chan->center_freq; ret = IEEE80211_STA_DISABLE_40MHZ | IEEE80211_STA_DISABLE_VHT; break; case NL80211_CHAN_WIDTH_80: tmp = (30 + c->chan->center_freq - c->center_freq1)/20; /* n_P40 */ tmp /= 2; /* freq_P40 */ c->center_freq1 = c->center_freq1 - 20 + 40 * tmp; c->width = NL80211_CHAN_WIDTH_40; ret = IEEE80211_STA_DISABLE_VHT; break; case NL80211_CHAN_WIDTH_80P80: c->center_freq2 = 0; c->width = NL80211_CHAN_WIDTH_80; ret = IEEE80211_STA_DISABLE_80P80MHZ | IEEE80211_STA_DISABLE_160MHZ; break; case NL80211_CHAN_WIDTH_160: /* n_P20 */ tmp = (70 + c->chan->center_freq - c->center_freq1)/20; /* n_P80 */ tmp /= 4; c->center_freq1 = c->center_freq1 - 40 + 80 * tmp; c->width = NL80211_CHAN_WIDTH_80; ret = IEEE80211_STA_DISABLE_80P80MHZ | IEEE80211_STA_DISABLE_160MHZ; break; default: case NL80211_CHAN_WIDTH_20_NOHT: WARN_ON_ONCE(1); c->width = NL80211_CHAN_WIDTH_20_NOHT; ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; break; case NL80211_CHAN_WIDTH_1: case NL80211_CHAN_WIDTH_2: case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); /* keep c->width */ ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; break; } WARN_ON_ONCE(!cfg80211_chandef_valid(c)); return ret; } /* * Returns true if smps_mode_new is strictly more restrictive than * smps_mode_old. */ bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old, enum ieee80211_smps_mode smps_mode_new) { if (WARN_ON_ONCE(smps_mode_old == IEEE80211_SMPS_AUTOMATIC || smps_mode_new == IEEE80211_SMPS_AUTOMATIC)) return false; switch (smps_mode_old) { case IEEE80211_SMPS_STATIC: return false; case IEEE80211_SMPS_DYNAMIC: return smps_mode_new == IEEE80211_SMPS_STATIC; case IEEE80211_SMPS_OFF: return smps_mode_new != IEEE80211_SMPS_OFF; default: WARN_ON(1); } return false; } int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings) { struct sk_buff *skb; struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.chan_switch); u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; skb = dev_alloc_skb(local->tx_headroom + hdr_len + 5 + /* channel switch announcement element */ 3 + /* secondary channel offset element */ 5 + /* wide bandwidth channel switch announcement */ 8); /* mesh channel switch parameters element */ if (!skb) return -ENOMEM; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); eth_broadcast_addr(mgmt->da); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); if (ieee80211_vif_is_mesh(&sdata->vif)) { memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); } mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; mgmt->u.action.u.chan_switch.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; pos = skb_put(skb, 5); *pos++ = WLAN_EID_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ *pos++ = csa_settings->block_tx ? 1 : 0; /* CSA mode */ freq = csa_settings->chandef.chan->center_freq; *pos++ = ieee80211_frequency_to_channel(freq); /* channel */ *pos++ = csa_settings->count; /* count */ if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_40) { enum nl80211_channel_type ch_type; skb_put(skb, 3); *pos++ = WLAN_EID_SECONDARY_CHANNEL_OFFSET; /* EID */ *pos++ = 1; /* IE length */ ch_type = cfg80211_get_chandef_type(&csa_settings->chandef); if (ch_type == NL80211_CHAN_HT40PLUS) *pos++ = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else *pos++ = IEEE80211_HT_PARAM_CHA_SEC_BELOW; } if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; skb_put(skb, 8); *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; /* EID */ *pos++ = 6; /* IE length */ *pos++ = sdata->u.mesh.mshcfg.dot11MeshTTL; /* Mesh TTL */ *pos = 0x00; /* Mesh Flag: Tx Restrict, Initiator, Reason */ *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR; *pos++ |= csa_settings->block_tx ? WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT : 0x00; put_unaligned_le16(WLAN_REASON_MESH_CHAN, pos); /* Reason Cd */ pos += 2; put_unaligned_le16(ifmsh->pre_value, pos);/* Precedence Value */ pos += 2; } if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_80P80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_160) { skb_put(skb, 5); ieee80211_ie_build_wide_bw_cs(pos, &csa_settings->chandef); } ieee80211_tx_skb(sdata, skb); return 0; } bool ieee80211_cs_valid(const struct ieee80211_cipher_scheme *cs) { return !(cs == NULL || cs->cipher == 0 || cs->hdr_len < cs->pn_len + cs->pn_off || cs->hdr_len <= cs->key_idx_off || cs->key_idx_shift > 7 || cs->key_idx_mask == 0); } bool ieee80211_cs_list_valid(const struct ieee80211_cipher_scheme *cs, int n) { int i; /* Ensure we have enough iftype bitmap space for all iftype values */ WARN_ON((NUM_NL80211_IFTYPES / 8 + 1) > sizeof(cs[0].iftype)); for (i = 0; i < n; i++) if (!ieee80211_cs_valid(&cs[i])) return false; return true; } const struct ieee80211_cipher_scheme * ieee80211_cs_get(struct ieee80211_local *local, u32 cipher, enum nl80211_iftype iftype) { const struct ieee80211_cipher_scheme *l = local->hw.cipher_schemes; int n = local->hw.n_cipher_schemes; int i; const struct ieee80211_cipher_scheme *cs = NULL; for (i = 0; i < n; i++) { if (l[i].cipher == cipher) { cs = &l[i]; break; } } if (!cs || !(cs->iftype & BIT(iftype))) return NULL; return cs; } int ieee80211_cs_headroom(struct ieee80211_local *local, struct cfg80211_crypto_settings *crypto, enum nl80211_iftype iftype) { const struct ieee80211_cipher_scheme *cs; int headroom = IEEE80211_ENCRYPT_HEADROOM; int i; for (i = 0; i < crypto->n_ciphers_pairwise; i++) { cs = ieee80211_cs_get(local, crypto->ciphers_pairwise[i], iftype); if (cs && headroom < cs->hdr_len) headroom = cs->hdr_len; } cs = ieee80211_cs_get(local, crypto->cipher_group, iftype); if (cs && headroom < cs->hdr_len) headroom = cs->hdr_len; return headroom; } static bool ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i) { s32 end = data->desc[i].start + data->desc[i].duration - (tsf + 1); int skip; if (end > 0) return false; /* One shot NOA */ if (data->count[i] == 1) return false; if (data->desc[i].interval == 0) return false; /* End time is in the past, check for repetitions */ skip = DIV_ROUND_UP(-end, data->desc[i].interval); if (data->count[i] < 255) { if (data->count[i] <= skip) { data->count[i] = 0; return false; } data->count[i] -= skip; } data->desc[i].start += skip * data->desc[i].interval; return true; } static bool ieee80211_extend_absent_time(struct ieee80211_noa_data *data, u32 tsf, s32 *offset) { bool ret = false; int i; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 cur; if (!data->count[i]) continue; if (ieee80211_extend_noa_desc(data, tsf + *offset, i)) ret = true; cur = data->desc[i].start - tsf; if (cur > *offset) continue; cur = data->desc[i].start + data->desc[i].duration - tsf; if (cur > *offset) *offset = cur; } return ret; } static u32 ieee80211_get_noa_absent_time(struct ieee80211_noa_data *data, u32 tsf) { s32 offset = 0; int tries = 0; /* * arbitrary limit, used to avoid infinite loops when combined NoA * descriptors cover the full time period. */ int max_tries = 5; ieee80211_extend_absent_time(data, tsf, &offset); do { if (!ieee80211_extend_absent_time(data, tsf, &offset)) break; tries++; } while (tries < max_tries); return offset; } void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf) { u32 next_offset = BIT(31) - 1; int i; data->absent = 0; data->has_next_tsf = false; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 start; if (!data->count[i]) continue; ieee80211_extend_noa_desc(data, tsf, i); start = data->desc[i].start - tsf; if (start <= 0) data->absent |= BIT(i); if (next_offset > start) next_offset = start; data->has_next_tsf = true; } if (data->absent) next_offset = ieee80211_get_noa_absent_time(data, tsf); data->next_tsf = tsf + next_offset; } EXPORT_SYMBOL(ieee80211_update_p2p_noa); int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, struct ieee80211_noa_data *data, u32 tsf) { int ret = 0; int i; memset(data, 0, sizeof(*data)); for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { const struct ieee80211_p2p_noa_desc *desc = &attr->desc[i]; if (!desc->count || !desc->duration) continue; data->count[i] = desc->count; data->desc[i].start = le32_to_cpu(desc->start_time); data->desc[i].duration = le32_to_cpu(desc->duration); data->desc[i].interval = le32_to_cpu(desc->interval); if (data->count[i] > 1 && data->desc[i].interval < data->desc[i].duration) continue; ieee80211_extend_noa_desc(data, tsf, i); ret++; } if (ret) ieee80211_update_p2p_noa(data, tsf); return ret; } EXPORT_SYMBOL(ieee80211_parse_p2p_noa); void ieee80211_recalc_dtim(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { u64 tsf = drv_get_tsf(local, sdata); u64 dtim_count = 0; u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; u8 dtim_period = sdata->vif.bss_conf.dtim_period; struct ps_data *ps; u8 bcns_from_dtim; if (tsf == -1ULL || !beacon_int || !dtim_period) return; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!sdata->bss) return; ps = &sdata->bss->ps; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { ps = &sdata->u.mesh.ps; } else { return; } /* * actually finds last dtim_count, mac80211 will update in * __beacon_add_tim(). * dtim_count = dtim_period - (tsf / bcn_int) % dtim_period */ do_div(tsf, beacon_int); bcns_from_dtim = do_div(tsf, dtim_period); /* just had a DTIM */ if (!bcns_from_dtim) dtim_count = 0; else dtim_count = dtim_period - bcns_from_dtim; ps->dtim_count = dtim_count; } static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_sub_if_data *sdata; u8 radar_detect = 0; lockdep_assert_held(&local->chanctx_mtx); if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return 0; list_for_each_entry(sdata, &ctx->reserved_vifs, reserved_chanctx_list) if (sdata->reserved_radar_required) radar_detect |= BIT(sdata->reserved_chandef.width); /* * An in-place reservation context should not have any assigned vifs * until it replaces the other context. */ WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && !list_empty(&ctx->assigned_vifs)); list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list) if (sdata->radar_required) radar_detect |= BIT(sdata->vif.bss_conf.chandef.width); return radar_detect; } int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, u8 radar_detect) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; enum nl80211_iftype iftype = sdata->wdev.iftype; struct ieee80211_chanctx *ctx; int total = 1; struct iface_combination_params params = { .radar_detect = radar_detect, }; lockdep_assert_held(&local->chanctx_mtx); if (WARN_ON(hweight32(radar_detect) > 1)) return -EINVAL; if (WARN_ON(chandef && chanmode == IEEE80211_CHANCTX_SHARED && !chandef->chan)) return -EINVAL; if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return -EINVAL; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT) { /* * always passing this is harmless, since it'll be the * same value that cfg80211 finds if it finds the same * interface ... and that's always allowed */ params.new_beacon_int = sdata->vif.bss_conf.beacon_int; } /* Always allow software iftypes */ if (cfg80211_iftype_allowed(local->hw.wiphy, iftype, 0, 1)) { if (radar_detect) return -EINVAL; return 0; } if (chandef) params.num_different_channels = 1; if (iftype != NL80211_IFTYPE_UNSPECIFIED) params.iftype_num[iftype] = 1; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; params.radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) { params.num_different_channels++; continue; } if (chandef && chanmode == IEEE80211_CHANCTX_SHARED && cfg80211_chandef_compatible(chandef, &ctx->conf.def)) continue; params.num_different_channels++; } list_for_each_entry_rcu(sdata_iter, &local->interfaces, list) { struct wireless_dev *wdev_iter; wdev_iter = &sdata_iter->wdev; if (sdata_iter == sdata || !ieee80211_sdata_running(sdata_iter) || cfg80211_iftype_allowed(local->hw.wiphy, wdev_iter->iftype, 0, 1)) continue; params.iftype_num[wdev_iter->iftype]++; total++; } if (total == 1 && !params.radar_detect) return 0; return cfg80211_check_combinations(local->hw.wiphy, ¶ms); } static void ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c, void *data) { u32 *max_num_different_channels = data; *max_num_different_channels = max(*max_num_different_channels, c->num_different_channels); } int ieee80211_max_num_channels(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; u32 max_num_different_channels = 1; int err; struct iface_combination_params params = {0}; lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; params.num_different_channels++; params.radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); } list_for_each_entry_rcu(sdata, &local->interfaces, list) params.iftype_num[sdata->wdev.iftype]++; err = cfg80211_iter_combinations(local->hw.wiphy, ¶ms, ieee80211_iter_max_chans, &max_num_different_channels); if (err < 0) return err; return max_num_different_channels; } void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_s1g_cap s1g_capab; u8 *pos; int i; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; if (!caps->s1g) return; memcpy(s1g_capab.capab_info, caps->cap, sizeof(caps->cap)); memcpy(s1g_capab.supp_mcs_nss, caps->nss_mcs, sizeof(caps->nss_mcs)); /* override the capability info */ for (i = 0; i < sizeof(ifmgd->s1g_capa.capab_info); i++) { u8 mask = ifmgd->s1g_capa_mask.capab_info[i]; s1g_capab.capab_info[i] &= ~mask; s1g_capab.capab_info[i] |= ifmgd->s1g_capa.capab_info[i] & mask; } /* then MCS and NSS set */ for (i = 0; i < sizeof(ifmgd->s1g_capa.supp_mcs_nss); i++) { u8 mask = ifmgd->s1g_capa_mask.supp_mcs_nss[i]; s1g_capab.supp_mcs_nss[i] &= ~mask; s1g_capab.supp_mcs_nss[i] |= ifmgd->s1g_capa.supp_mcs_nss[i] & mask; } pos = skb_put(skb, 2 + sizeof(s1g_capab)); *pos++ = WLAN_EID_S1G_CAPABILITIES; *pos++ = sizeof(s1g_capab); memcpy(pos, &s1g_capab, sizeof(s1g_capab)); } void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { u8 *pos = skb_put(skb, 3); *pos++ = WLAN_EID_AID_REQUEST; *pos++ = 1; *pos++ = 0; } u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) { *buf++ = WLAN_EID_VENDOR_SPECIFIC; *buf++ = 7; /* len */ *buf++ = 0x00; /* Microsoft OUI 00:50:F2 */ *buf++ = 0x50; *buf++ = 0xf2; *buf++ = 2; /* WME */ *buf++ = 0; /* WME info */ *buf++ = 1; /* WME ver */ *buf++ = qosinfo; /* U-APSD no in use */ return buf; } void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt) { struct txq_info *txqi = to_txq_info(txq); u32 frag_cnt = 0, frag_bytes = 0; struct sk_buff *skb; skb_queue_walk(&txqi->frags, skb) { frag_cnt++; frag_bytes += skb->len; } if (frame_cnt) *frame_cnt = txqi->tin.backlog_packets + frag_cnt; if (byte_cnt) *byte_cnt = txqi->tin.backlog_bytes + frag_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = { IEEE80211_WMM_IE_STA_QOSINFO_AC_VO, IEEE80211_WMM_IE_STA_QOSINFO_AC_VI, IEEE80211_WMM_IE_STA_QOSINFO_AC_BE, IEEE80211_WMM_IE_STA_QOSINFO_AC_BK }; u16 ieee80211_encode_usf(int listen_interval) { static const int listen_int_usf[] = { 1, 10, 1000, 10000 }; u16 ui, usf = 0; /* find greatest USF */ while (usf < IEEE80211_MAX_USF) { if (listen_interval % listen_int_usf[usf + 1]) break; usf += 1; } ui = listen_interval / listen_int_usf[usf]; /* error if there is a remainder. Should've been checked by user */ WARN_ON_ONCE(ui > IEEE80211_MAX_UI); listen_interval = FIELD_PREP(LISTEN_INT_USF, usf) | FIELD_PREP(LISTEN_INT_UI, ui); return (u16) listen_interval; } |
534 532 50 532 417 417 416 417 158 417 417 417 417 417 15 20 157 158 15 20 9 15 18 11 15 18 9 8 9 8 9 4 4 4 15 15 15 15 8 9 9 8 9 15 20 20 16 18 20 11 11 10 8 8 8 8 520 528 417 417 416 417 417 417 415 417 417 528 526 518 529 9 9 9 9 412 412 413 413 417 417 417 412 413 416 416 50 50 50 50 50 50 417 50 416 417 417 417 417 417 417 417 417 417 158 156 158 157 158 157 519 516 158 157 158 156 158 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 | // SPDX-License-Identifier: GPL-2.0 /* * /proc/sys support */ #include <linux/init.h> #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/printk.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/uio.h> #include <linux/module.h> #include <linux/bpf-cgroup.h> #include <linux/mount.h> #include <linux/kmemleak.h> #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; EXPORT_SYMBOL_GPL(sysctl_long_vals); /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { { } }; static bool is_empty_dir(struct ctl_table_header *head) { return head->ctl_table[0].child == sysctl_mount_point; } static void set_empty_dir(struct ctl_dir *dir) { dir->header.ctl_table[0].child = sysctl_mount_point; } static void clear_empty_dir(struct ctl_dir *dir) { dir->header.ctl_table[0].child = NULL; } void proc_sys_poll_notify(struct ctl_table_poll *poll) { if (!poll) return; atomic_inc(&poll->event); wake_up_interruptible(&poll->wait); } static struct ctl_table root_table[] = { { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, { } }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { {{.count = 1, .nreg = 1, .ctl_table = root_table }}, .ctl_table_arg = root_table, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }, }; static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); static void sysctl_print_dir(struct ctl_dir *dir) { if (dir->header.parent) sysctl_print_dir(dir->header.parent); pr_cont("%s/", dir->header.ctl_table[0].procname); } static int namecmp(const char *name1, int len1, const char *name2, int len2) { int cmp; cmp = memcmp(name1, name2, min(len1, len2)); if (cmp == 0) cmp = len1 - len2; return cmp; } /* Called under sysctl_lock */ static struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; struct rb_node *node = dir->root.rb_node; while (node) { struct ctl_node *ctl_node; const char *procname; int cmp; ctl_node = rb_entry(node, struct ctl_node, node); head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; procname = entry->procname; cmp = namecmp(name, namelen, procname, strlen(procname)); if (cmp < 0) node = node->rb_left; else if (cmp > 0) node = node->rb_right; else { *phead = head; return entry; } } return NULL; } static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; struct rb_node **p = &head->parent->root.rb_node; struct rb_node *parent = NULL; const char *name = entry->procname; int namelen = strlen(name); while (*p) { struct ctl_table_header *parent_head; struct ctl_table *parent_entry; struct ctl_node *parent_node; const char *parent_name; int cmp; parent = *p; parent_node = rb_entry(parent, struct ctl_node, node); parent_head = parent_node->header; parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; parent_name = parent_entry->procname; cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) p = &(*p)->rb_right; else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); pr_cont("/%s\n", entry->procname); return -EEXIST; } } rb_link_node(node, parent, p); rb_insert_color(node, &head->parent->root); return 0; } static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; rb_erase(node, &head->parent->root); } static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, struct ctl_node *node, struct ctl_table *table) { head->ctl_table = table; head->ctl_table_arg = table; head->used = 0; head->count = 1; head->nreg = 1; head->unregistering = NULL; head->root = root; head->set = set; head->parent = NULL; head->node = node; INIT_HLIST_HEAD(&head->inodes); if (node) { struct ctl_table *entry; for (entry = table; entry->procname; entry++, node++) node->header = head; } } static void erase_header(struct ctl_table_header *head) { struct ctl_table *entry; for (entry = head->ctl_table; entry->procname; entry++) erase_entry(head, entry); } static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { struct ctl_table *entry; int err; /* Is this a permanently empty directory? */ if (is_empty_dir(&dir->header)) return -EROFS; /* Am I creating a permanently empty directory? */ if (header->ctl_table == sysctl_mount_point) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; set_empty_dir(dir); } dir->header.nreg++; header->parent = dir; err = insert_links(header); if (err) goto fail_links; for (entry = header->ctl_table; entry->procname; entry++) { err = insert_entry(header, entry); if (err) goto fail; } return 0; fail: erase_header(header); put_links(header); fail_links: if (header->ctl_table == sysctl_mount_point) clear_empty_dir(dir); header->parent = NULL; drop_sysctl_table(&dir->header); return err; } /* called under sysctl_lock */ static int use_table(struct ctl_table_header *p) { if (unlikely(p->unregistering)) return 0; p->used++; return 1; } /* called under sysctl_lock */ static void unuse_table(struct ctl_table_header *p) { if (!--p->used) if (unlikely(p->unregistering)) complete(p->unregistering); } static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } /* called under sysctl_lock, will reacquire if has to wait */ static void start_unregistering(struct ctl_table_header *p) { /* * if p->used is 0, nobody will ever touch that entry again; * we'll eliminate all paths to it before dropping sysctl_lock */ if (unlikely(p->used)) { struct completion wait; init_completion(&wait); p->unregistering = &wait; spin_unlock(&sysctl_lock); wait_for_completion(&wait); } else { /* anything non-NULL; we'll never dereference it */ p->unregistering = ERR_PTR(-EINVAL); spin_unlock(&sysctl_lock); } /* * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. */ proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. */ spin_lock(&sysctl_lock); erase_header(p); } static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) { BUG_ON(!head); spin_lock(&sysctl_lock); if (!use_table(head)) head = ERR_PTR(-ENOENT); spin_unlock(&sysctl_lock); return head; } static void sysctl_head_finish(struct ctl_table_header *head) { if (!head) return; spin_lock(&sysctl_lock); unuse_table(head); spin_unlock(&sysctl_lock); } static struct ctl_table_set * lookup_header_set(struct ctl_table_root *root) { struct ctl_table_set *set = &root->default_set; if (root->lookup) set = root->lookup(root); return set; } static struct ctl_table *lookup_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; spin_lock(&sysctl_lock); entry = find_entry(&head, dir, name, namelen); if (entry && use_table(head)) *phead = head; else entry = NULL; spin_unlock(&sysctl_lock); return entry; } static struct ctl_node *first_usable_entry(struct rb_node *node) { struct ctl_node *ctl_node; for (;node; node = rb_next(node)) { ctl_node = rb_entry(node, struct ctl_node, node); if (use_table(ctl_node->header)) return ctl_node; } return NULL; } static void first_entry(struct ctl_dir *dir, struct ctl_table_header **phead, struct ctl_table **pentry) { struct ctl_table_header *head = NULL; struct ctl_table *entry = NULL; struct ctl_node *ctl_node; spin_lock(&sysctl_lock); ctl_node = first_usable_entry(rb_first(&dir->root)); spin_unlock(&sysctl_lock); if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) { struct ctl_table_header *head = *phead; struct ctl_table *entry = *pentry; struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; spin_lock(&sysctl_lock); unuse_table(head); ctl_node = first_usable_entry(rb_next(&ctl_node->node)); spin_unlock(&sysctl_lock); head = NULL; if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. */ static int test_perm(int mode, int op) { if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) mode >>= 6; else if (in_egroup_p(GLOBAL_ROOT_GID)) mode >>= 3; if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) { struct ctl_table_root *root = head->root; int mode; if (root->permissions) mode = root->permissions(head, table); else mode = table->mode; return test_perm(mode, op); } static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); ei = PROC_I(inode); spin_lock(&sysctl_lock); if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; if (is_empty_dir(head)) make_empty_dir_inode(inode); } inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; if (root->set_ownership) root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); return inode; } void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); } static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; if (!head) head = &sysctl_table_root.default_set.dir.header; return sysctl_head_grab(head); } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; const struct qstr *name = &dentry->d_name; struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; int ret; if (IS_ERR(head)) return ERR_CAST(head); ctl_dir = container_of(head, struct ctl_dir, header); p = lookup_entry(&h, ctl_dir, name->name, name->len); if (!p) goto out; if (S_ISLNK(p->mode)) { ret = sysctl_follow_link(&h, &p); err = ERR_PTR(ret); if (ret) goto out; } inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); if (IS_ERR(inode)) { err = ERR_CAST(inode); goto out; } d_set_d_op(dentry, &proc_sys_dentry_operations); err = d_splice_alias(inode, dentry); out: if (h) sysctl_head_finish(h); sysctl_head_finish(head); return err; } static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, int write) { struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; size_t count = iov_iter_count(iter); char *kbuf; ssize_t error; if (IS_ERR(head)) return PTR_ERR(head); /* * At this point we know that the sysctl was not unregistered * and won't be until we finish. */ error = -EPERM; if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ error = -EINVAL; if (!table->proc_handler) goto out; /* don't even try if the size is too large */ error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; if (write) { error = -EFAULT; if (!copy_from_iter_full(kbuf, count, iter)) goto out_free_buf; kbuf[count] = '\0'; } error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; /* careful: calling conventions are nasty here */ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; if (!write) { error = -EFAULT; if (copy_to_iter(kbuf, count, iter) < count) goto out_free_buf; } error = count; out_free_buf: kvfree(kbuf); out: sysctl_head_finish(head); return error; } static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 0); } static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 1); } static int proc_sys_open(struct inode *inode, struct file *filp) { struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* sysctl was unregistered */ if (IS_ERR(head)) return PTR_ERR(head); if (table->poll) filp->private_data = proc_sys_poll_event(table->poll); sysctl_head_finish(head); return 0; } static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; __poll_t ret = DEFAULT_POLLMASK; unsigned long event; /* sysctl was unregistered */ if (IS_ERR(head)) return EPOLLERR | EPOLLHUP; if (!table->proc_handler) goto out; if (!table->poll) goto out; event = (unsigned long)filp->private_data; poll_wait(filp, &table->poll->wait, wait); if (event != atomic_read(&table->poll->event)) { filp->private_data = proc_sys_poll_event(table->poll); ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } out: sysctl_head_finish(head); return ret; } static bool proc_sys_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; unsigned type = DT_UNKNOWN; qname.name = table->procname; qname.len = strlen(table->procname); qname.hash = full_name_hash(dir, qname.name, qname.len); child = d_lookup(dir, &qname); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); if (IS_ERR(inode)) { d_lookup_done(child); dput(child); return false; } d_set_d_op(child, &proc_sys_dentry_operations); res = d_splice_alias(inode, child); d_lookup_done(child); if (unlikely(res)) { if (IS_ERR(res)) { dput(child); return false; } dput(child); child = res; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); return dir_emit(ctx, qname.name, qname.len, ino, type); } static bool proc_sys_link_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { bool ret = true; head = sysctl_head_grab(head); if (IS_ERR(head)) return false; /* It is not an error if we can not follow the link ignore it */ if (sysctl_follow_link(&head, &table)) goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: sysctl_head_finish(head); return ret; } static int scan(struct ctl_table_header *head, struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { bool res; if ((*pos)++ < ctx->pos) return true; if (unlikely(S_ISLNK(table->mode))) res = proc_sys_link_fill_cache(file, ctx, head, table); else res = proc_sys_fill_cache(file, ctx, head, table); if (res) ctx->pos = *pos; return res; } static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; if (IS_ERR(head)) return PTR_ERR(head); ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) goto out; pos = 2; for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { if (!scan(h, entry, &pos, file, ctx)) { sysctl_head_finish(h); break; } } out: sysctl_head_finish(head); return 0; } static int proc_sys_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { /* * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ struct ctl_table_header *head; struct ctl_table *table; int error; /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; head = grab_header(inode); if (IS_ERR(head)) return PTR_ERR(head); table = PROC_I(inode)->sysctl_entry; if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; } static int proc_sys_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } static int proc_sys_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; if (IS_ERR(head)) return PTR_ERR(head); generic_fillattr(&init_user_ns, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; sysctl_head_finish(head); return 0; } static const struct file_operations proc_sys_file_operations = { .open = proc_sys_open, .poll = proc_sys_poll, .read_iter = proc_sys_read, .write_iter = proc_sys_write, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = default_llseek, }; static const struct file_operations proc_sys_dir_file_operations = { .read = generic_read_dir, .iterate_shared = proc_sys_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_sys_inode_operations = { .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static const struct inode_operations proc_sys_dir_operations = { .lookup = proc_sys_lookup, .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; return !PROC_I(d_inode(dentry))->sysctl->unregistering; } static int proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(d_inode(dentry))->sysctl->unregistering; } static int sysctl_is_seen(struct ctl_table_header *p) { struct ctl_table_set *set = p->set; int res; spin_lock(&sysctl_lock); if (p->unregistering) res = 0; else if (!set->is_seen) res = 1; else res = set->is_seen(set); spin_unlock(&sysctl_lock); return res; } static int proc_sys_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct ctl_table_header *head; struct inode *inode; /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ /* AV: can it, indeed? */ inode = d_inode_rcu(dentry); if (!inode) return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; head = rcu_dereference(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { .d_revalidate = proc_sys_revalidate, .d_delete = proc_sys_delete, .d_compare = proc_sys_compare, }; static struct ctl_dir *find_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; entry = find_entry(&head, dir, name, namelen); if (!entry) return ERR_PTR(-ENOENT); if (!S_ISDIR(entry->mode)) return ERR_PTR(-ENOTDIR); return container_of(head, struct ctl_dir, header); } static struct ctl_dir *new_dir(struct ctl_table_set *set, const char *name, int namelen) { struct ctl_table *table; struct ctl_dir *new; struct ctl_node *node; char *new_name; new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + sizeof(struct ctl_table)*2 + namelen + 1, GFP_KERNEL); if (!new) return NULL; node = (struct ctl_node *)(new + 1); table = (struct ctl_table *)(node + 1); new_name = (char *)(table + 2); memcpy(new_name, name, namelen); new_name[namelen] = '\0'; table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; init_header(&new->header, set->dir.header.root, set, node, table); return new; } /** * get_subdir - find or create a subdir with the specified name. * @dir: Directory to create the subdirectory in * @name: The name of the subdirectory to find or create * @namelen: The length of name * * Takes a directory with an elevated reference count so we know that * if we drop the lock the directory will not go away. Upon success * the reference is moved from @dir to the returned subdirectory. * Upon error an error code is returned and the reference on @dir is * simply dropped. */ static struct ctl_dir *get_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_set *set = dir->header.set; struct ctl_dir *subdir, *new = NULL; int err; spin_lock(&sysctl_lock); subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; spin_unlock(&sysctl_lock); new = new_dir(set, name, namelen); spin_lock(&sysctl_lock); subdir = ERR_PTR(-ENOMEM); if (!new) goto failed; /* Was the subdir added while we dropped the lock? */ subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; /* Nope. Use the our freshly made directory entry. */ err = insert_header(dir, &new->header); subdir = ERR_PTR(err); if (err) goto failed; subdir = new; found: subdir->header.nreg++; failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); pr_cont("/%*.*s %ld\n", namelen, namelen, name, PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) drop_sysctl_table(&new->header); spin_unlock(&sysctl_lock); return subdir; } static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) { struct ctl_dir *parent; const char *procname; if (!dir->header.parent) return &set->dir; parent = xlate_dir(set, dir->header.parent); if (IS_ERR(parent)) return parent; procname = dir->header.ctl_table[0].procname; return find_subdir(parent, procname, strlen(procname)); } static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_table **pentry) { struct ctl_table_header *head; struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_table *entry; struct ctl_dir *dir; int ret; ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); dir = xlate_dir(set, (*phead)->parent); if (IS_ERR(dir)) ret = PTR_ERR(dir); else { const char *procname = (*pentry)->procname; head = NULL; entry = find_entry(&head, dir, procname, strlen(procname)); ret = -ENOENT; if (entry && use_table(head)) { unuse_table(*phead); *phead = head; *pentry = entry; ret = 0; } } spin_unlock(&sysctl_lock); return ret; } static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("sysctl table check failed: %s/%s %pV\n", path, table->procname, &vaf); va_end(args); return -EINVAL; } static int sysctl_check_table_array(const char *path, struct ctl_table *table) { int err = 0; if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) err |= sysctl_err(path, table, "array not allowed"); } if (table->proc_handler == proc_dou8vec_minmax) { if (table->maxlen != sizeof(u8)) err |= sysctl_err(path, table, "array not allowed"); } return err; } static int sysctl_check_table(const char *path, struct ctl_table *table) { int err = 0; for (; table->procname; table++) { if (table->child) err |= sysctl_err(path, table, "Not a file"); if ((table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || (table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dou8vec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || (table->proc_handler == proc_dointvec_ms_jiffies) || (table->proc_handler == proc_doulongvec_minmax) || (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { if (!table->data) err |= sysctl_err(path, table, "No data"); if (!table->maxlen) err |= sysctl_err(path, table, "No maxlen"); else err |= sysctl_check_table_array(path, table); } if (!table->proc_handler) err |= sysctl_err(path, table, "No proc_handler"); if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) err |= sysctl_err(path, table, "bogus .mode 0%o", table->mode); } return err; } static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, struct ctl_table_root *link_root) { struct ctl_table *link_table, *entry, *link; struct ctl_table_header *links; struct ctl_node *node; char *link_name; int nr_entries, name_bytes; name_bytes = 0; nr_entries = 0; for (entry = table; entry->procname; entry++) { nr_entries++; name_bytes += strlen(entry->procname) + 1; } links = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*nr_entries + sizeof(struct ctl_table)*(nr_entries + 1) + name_bytes, GFP_KERNEL); if (!links) return NULL; node = (struct ctl_node *)(links + 1); link_table = (struct ctl_table *)(node + nr_entries); link_name = (char *)&link_table[nr_entries + 1]; for (link = link_table, entry = table; entry->procname; link++, entry++) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; link->data = link_root; link_name += len; } init_header(links, dir->header.root, dir->header.set, node, link_table); links->nreg = nr_entries; return links; } static bool get_links(struct ctl_dir *dir, struct ctl_table *table, struct ctl_table_root *link_root) { struct ctl_table_header *head; struct ctl_table *entry, *link; /* Are there links available for every entry in table? */ for (entry = table; entry->procname; entry++) { const char *procname = entry->procname; link = find_entry(&head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) continue; if (S_ISLNK(link->mode) && (link->data == link_root)) continue; return false; } /* The checks passed. Increase the registration count on the links */ for (entry = table; entry->procname; entry++) { const char *procname = entry->procname; link = find_entry(&head, dir, procname, strlen(procname)); head->nreg++; } return true; } static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_dir *core_parent = NULL; struct ctl_table_header *links; int err; if (head->set == root_set) return 0; core_parent = xlate_dir(root_set, head->parent); if (IS_ERR(core_parent)) return 0; if (get_links(core_parent, head->ctl_table, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); links = new_links(core_parent, head->ctl_table, head->root); spin_lock(&sysctl_lock); err = -ENOMEM; if (!links) goto out; err = 0; if (get_links(core_parent, head->ctl_table, head->root)) { kfree(links); goto out; } err = insert_header(core_parent, links); if (err) kfree(links); out: drop_sysctl_table(&core_parent->header); return err; } /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * The members of the &struct ctl_table structure are used as follows: * * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file * * data - a pointer to data for use by proc_handler * * maxlen - the maximum size in bytes of the data * * mode - the file permissions for the /proc/sys file * * child - must be %NULL. * * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes will be represented by directories. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - * * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() * * It is the handler's job to read the input buffer from user memory * and process it. The handler should return 0 on success. * * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, struct ctl_table *table) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; const char *name, *nextname; struct ctl_dir *dir; struct ctl_table *entry; struct ctl_node *node; int nr_entries = 0; for (entry = table; entry->procname; entry++) nr_entries++; header = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); if (!header) return NULL; node = (struct ctl_node *)(header + 1); init_header(header, root, set, node, table); if (sysctl_check_table(path, table)) goto fail; spin_lock(&sysctl_lock); dir = &set->dir; /* Reference moved down the diretory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); /* Find the directory for the ctl_table */ for (name = path; name; name = nextname) { int namelen; nextname = strchr(name, '/'); if (nextname) { namelen = nextname - name; nextname++; } else { namelen = strlen(name); } if (namelen == 0) continue; dir = get_subdir(dir, name, namelen); if (IS_ERR(dir)) goto fail; } spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); return header; fail_put_dir_locked: drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); fail: kfree(header); dump_stack(); return NULL; } /** * register_sysctl - register a sysctl table * @path: The path to the directory the sysctl table is in. * @table: the table structure * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details. */ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) { return __register_sysctl_table(&sysctl_table_root.default_set, path, table); } EXPORT_SYMBOL(register_sysctl); /** * __register_sysctl_init() - register sysctl table to path * @path: path name for sysctl base * @table: This is the sysctl table that needs to be registered to the path * @table_name: The name of sysctl table, only used for log printing when * registration fails * * The sysctl interface is used by userspace to query or modify at runtime * a predefined value set on a variable. These variables however have default * values pre-set. Code which depends on these variables will always work even * if register_sysctl() fails. If register_sysctl() fails you'd just loose the * ability to query or modify the sysctls dynamically at run time. Chances of * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * * Context: Can only be called after your respective sysctl base path has been * registered. So for instance, most base directories are registered early on * init before init levels are processed through proc_sys_init() and * sysctl_init(). */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name) { struct ctl_table_header *hdr = register_sysctl(path, table); if (unlikely(!hdr)) { pr_err("failed when register_sysctl %s to %s\n", table_name, path); return; } kmemleak_not_leak(hdr); } static char *append_path(const char *path, char *pos, const char *name) { int namelen; namelen = strlen(name); if (((pos - path) + namelen + 2) >= PATH_MAX) return NULL; memcpy(pos, name, namelen); pos[namelen] = '/'; pos[namelen + 1] = '\0'; pos += namelen + 1; return pos; } static int count_subheaders(struct ctl_table *table) { int has_files = 0; int nr_subheaders = 0; struct ctl_table *entry; /* special case: no directory and empty directory */ if (!table || !table->procname) return 1; for (entry = table; entry->procname; entry++) { if (entry->child) nr_subheaders += count_subheaders(entry->child); else has_files = 1; } return nr_subheaders + has_files; } static int register_leaf_sysctl_tables(const char *path, char *pos, struct ctl_table_header ***subheader, struct ctl_table_set *set, struct ctl_table *table) { struct ctl_table *ctl_table_arg = NULL; struct ctl_table *entry, *files; int nr_files = 0; int nr_dirs = 0; int err = -ENOMEM; for (entry = table; entry->procname; entry++) { if (entry->child) nr_dirs++; else nr_files++; } files = table; /* If there are mixed files and directories we need a new table */ if (nr_dirs && nr_files) { struct ctl_table *new; files = kcalloc(nr_files + 1, sizeof(struct ctl_table), GFP_KERNEL); if (!files) goto out; ctl_table_arg = files; for (new = files, entry = table; entry->procname; entry++) { if (entry->child) continue; *new = *entry; new++; } } /* Register everything except a directory full of subdirectories */ if (nr_files || !nr_dirs) { struct ctl_table_header *header; header = __register_sysctl_table(set, path, files); if (!header) { kfree(ctl_table_arg); goto out; } /* Remember if we need to free the file table */ header->ctl_table_arg = ctl_table_arg; **subheader = header; (*subheader)++; } /* Recurse into the subdirectories. */ for (entry = table; entry->procname; entry++) { char *child_pos; if (!entry->child) continue; err = -ENAMETOOLONG; child_pos = append_path(path, pos, entry->procname); if (!child_pos) goto out; err = register_leaf_sysctl_tables(path, child_pos, subheader, set, entry->child); pos[0] = '\0'; if (err) goto out; } err = 0; out: /* On failure our caller will unregister all registered subheaders */ return err; } /** * __register_sysctl_paths - register a sysctl table hierarchy * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details. */ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_set *set, const struct ctl_path *path, struct ctl_table *table) { struct ctl_table *ctl_table_arg = table; int nr_subheaders = count_subheaders(table); struct ctl_table_header *header = NULL, **subheaders, **subheader; const struct ctl_path *component; char *new_path, *pos; pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); if (!new_path) return NULL; pos[0] = '\0'; for (component = path; component->procname; component++) { pos = append_path(new_path, pos, component->procname); if (!pos) goto out; } while (table->procname && table->child && !table[1].procname) { pos = append_path(new_path, pos, table->procname); if (!pos) goto out; table = table->child; } if (nr_subheaders == 1) { header = __register_sysctl_table(set, new_path, table); if (header) header->ctl_table_arg = ctl_table_arg; } else { header = kzalloc(sizeof(*header) + sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); if (!header) goto out; subheaders = (struct ctl_table_header **) (header + 1); subheader = subheaders; header->ctl_table_arg = ctl_table_arg; if (register_leaf_sysctl_tables(new_path, pos, &subheader, set, table)) goto err_register_leaves; } out: kfree(new_path); return header; err_register_leaves: while (subheader > subheaders) { struct ctl_table_header *subh = *(--subheader); struct ctl_table *table = subh->ctl_table_arg; unregister_sysctl_table(subh); kfree(table); } kfree(header); header = NULL; goto out; } /** * register_sysctl_paths - register a sysctl table hierarchy * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_paths for more details. */ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table) { return __register_sysctl_paths(&sysctl_table_root.default_set, path, table); } EXPORT_SYMBOL(register_sysctl_paths); /** * register_sysctl_table - register a sysctl table hierarchy * @table: the top-level table structure * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See register_sysctl_paths for more details. */ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) { static const struct ctl_path null_path[] = { {} }; return register_sysctl_paths(null_path, table); } EXPORT_SYMBOL(register_sysctl_table); static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_table_root *root = header->root; struct ctl_dir *parent = header->parent; struct ctl_dir *core_parent; struct ctl_table *entry; if (header->set == root_set) return; core_parent = xlate_dir(root_set, parent); if (IS_ERR(core_parent)) return; for (entry = header->ctl_table; entry->procname; entry++) { struct ctl_table_header *link_head; struct ctl_table *link; const char *name = entry->procname; link = find_entry(&link_head, core_parent, name, strlen(name)); if (link && ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || (S_ISLNK(link->mode) && (link->data == root)))) { drop_sysctl_table(link_head); } else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); pr_cont("/%s\n", name); } } } static void drop_sysctl_table(struct ctl_table_header *header) { struct ctl_dir *parent = header->parent; if (--header->nreg) return; if (parent) { put_links(header); start_unregistering(header); } if (!--header->count) kfree_rcu(header, rcu); if (parent) drop_sysctl_table(&parent->header); } /** * unregister_sysctl_table - unregister a sysctl table hierarchy * @header: the header returned from register_sysctl_table * * Unregisters the sysctl table and all children. proc entries may not * actually be removed until they are no longer used by anyone. */ void unregister_sysctl_table(struct ctl_table_header * header) { int nr_subheaders; might_sleep(); if (header == NULL) return; nr_subheaders = count_subheaders(header->ctl_table_arg); if (unlikely(nr_subheaders > 1)) { struct ctl_table_header **subheaders; int i; subheaders = (struct ctl_table_header **)(header + 1); for (i = nr_subheaders -1; i >= 0; i--) { struct ctl_table_header *subh = subheaders[i]; struct ctl_table *table = subh->ctl_table_arg; unregister_sysctl_table(subh); kfree(table); } kfree(header); return; } spin_lock(&sysctl_lock); drop_sysctl_table(header); spin_unlock(&sysctl_lock); } EXPORT_SYMBOL(unregister_sysctl_table); void setup_sysctl_set(struct ctl_table_set *set, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { memset(set, 0, sizeof(*set)); set->is_seen = is_seen; init_header(&set->dir.header, root, set, NULL, root_table); } void retire_sysctl_set(struct ctl_table_set *set) { WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); } int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return sysctl_init(); } struct sysctl_alias { const char *kernel_param; const char *sysctl_param; }; /* * Historically some settings had both sysctl and a command line parameter. * With the generic sysctl. parameter support, we can handle them at a single * place and only keep the historical name for compatibility. This is not meant * to add brand new aliases. When adding existing aliases, consider whether * the possibly different moment of changing the value (e.g. from early_param * to the moment do_sysctl_args() is called) is an issue for the specific * parameter. */ static const struct sysctl_alias sysctl_aliases[] = { {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, {"hung_task_panic", "kernel.hung_task_panic" }, {"numa_zonelist_order", "vm.numa_zonelist_order" }, {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, { } }; static const char *sysctl_find_alias(char *param) { const struct sysctl_alias *alias; for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { if (strcmp(alias->kernel_param, param) == 0) return alias->sysctl_param; } return NULL; } bool sysctl_is_alias(char *param) { const char *alias = sysctl_find_alias(param); return alias != NULL; } /* Set sysctl value passed on kernel command line. */ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) { char *path; struct vfsmount **proc_mnt = arg; struct file_system_type *proc_fs_type; struct file *file; int len; int err; loff_t pos = 0; ssize_t wret; if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; if (param[0] != '/' && param[0] != '.') return 0; param++; } else { param = (char *) sysctl_find_alias(param); if (!param) return 0; } if (!val) return -EINVAL; len = strlen(val); if (len == 0) return -EINVAL; /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no * options were given, we mount it only when the first sysctl option is * found. Why not a persistent mount? There are problems with a * persistent mount of proc in that it forces userspace not to use any * proc mount options. */ if (!*proc_mnt) { proc_fs_type = get_fs_type("proc"); if (!proc_fs_type) { pr_err("Failed to find procfs to set sysctl from command line\n"); return 0; } *proc_mnt = kern_mount(proc_fs_type); put_filesystem(proc_fs_type); if (IS_ERR(*proc_mnt)) { pr_err("Failed to mount procfs to set sysctl from command line\n"); return 0; } } path = kasprintf(GFP_KERNEL, "sys/%s", param); if (!path) panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", param, val); else if (err == -EACCES) pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", param, val); else pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", file, param, val); goto out; } wret = kernel_write(file, val, len, &pos); if (wret < 0) { err = wret; if (err == -EINVAL) pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", param, val); else pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", ERR_PTR(err), param, val); } else if (wret != len) { pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", wret, len, path, param, val); } err = filp_close(file, NULL); if (err) pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", ERR_PTR(err), param, val); out: kfree(path); return 0; } void do_sysctl_args(void) { char *command_line; struct vfsmount *proc_mnt = NULL; command_line = kstrdup(saved_command_line, GFP_KERNEL); if (!command_line) panic("%s: Failed to allocate copy of command line\n", __func__); parse_args("Setting sysctl args", command_line, NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); if (proc_mnt) kern_unmount(proc_mnt); kfree(command_line); } |
97 66 97 8 89 711 3421 521 207 680 5970 853 7 3616 3619 611 789 964 82 81 82 25 13 13 1 1 10 30 199 31 174 19 86 635 189 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 | /* SPDX-License-Identifier: GPL-2.0 */ /* * net/dst.h Protocol independent destination cache definitions. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #ifndef _NET_DST_H #define _NET_DST_H #include <net/dst_ops.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/bug.h> #include <linux/jiffies.h> #include <linux/refcount.h> #include <net/neighbour.h> #include <asm/processor.h> #include <linux/indirect_call_wrapper.h> struct sk_buff; struct dst_entry { struct net_device *dev; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else void *__pad1; #endif int (*input)(struct sk_buff *); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); unsigned short flags; #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 #define DST_NOCOUNT 0x0008 #define DST_FAKE_RTABLE 0x0010 #define DST_XFRM_TUNNEL 0x0020 #define DST_XFRM_QUEUE 0x0040 #define DST_METADATA 0x0080 /* A non-zero value of dst->obsolete forces by-hand validation * of the route entry. Positive values are set by the generic * dst layer to indicate that the entry has been forcefully * destroyed. * * Negative values are used by the implementation layer code to * force invocation of the dst_ops->check() method. */ short obsolete; #define DST_OBSOLETE_NONE 0 #define DST_OBSOLETE_DEAD 2 #define DST_OBSOLETE_FORCE_CHK -1 #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ /* * __refcnt wants to be on a different cache line from * input/output/ops or performance tanks badly */ #ifdef CONFIG_64BIT atomic_t __refcnt; /* 64-bit offset 64 */ #endif int __use; unsigned long lastuse; struct lwtunnel_state *lwtstate; struct rcu_head rcu_head; short error; short __pad; __u32 tclassid; #ifndef CONFIG_64BIT atomic_t __refcnt; /* 32-bit offset 64 */ #endif }; struct dst_metrics { u32 metrics[RTAX_MAX]; refcount_t refcnt; } __aligned(4); /* Low pointer bits contain DST_METRICS_FLAGS */ extern const struct dst_metrics dst_default_metrics; u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); #define DST_METRICS_READ_ONLY 0x1UL #define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) #define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics) static inline bool dst_metrics_read_only(const struct dst_entry *dst) { return dst->_metrics & DST_METRICS_READ_ONLY; } void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old); static inline void dst_destroy_metrics_generic(struct dst_entry *dst) { unsigned long val = dst->_metrics; if (!(val & DST_METRICS_READ_ONLY)) __dst_destroy_metrics_generic(dst, val); } static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst) { unsigned long p = dst->_metrics; BUG_ON(!p); if (p & DST_METRICS_READ_ONLY) return dst->ops->cow_metrics(dst, p); return __DST_METRICS_PTR(p); } /* This may only be invoked before the entry has reached global * visibility. */ static inline void dst_init_metrics(struct dst_entry *dst, const u32 *src_metrics, bool read_only) { dst->_metrics = ((unsigned long) src_metrics) | (read_only ? DST_METRICS_READ_ONLY : 0); } static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) { u32 *dst_metrics = dst_metrics_write_ptr(dest); if (dst_metrics) { u32 *src_metrics = DST_METRICS_PTR(src); memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32)); } } static inline u32 *dst_metrics_ptr(struct dst_entry *dst) { return DST_METRICS_PTR(dst); } static inline u32 dst_metric_raw(const struct dst_entry *dst, const int metric) { u32 *p = DST_METRICS_PTR(dst); return p[metric-1]; } static inline u32 dst_metric(const struct dst_entry *dst, const int metric) { WARN_ON_ONCE(metric == RTAX_HOPLIMIT || metric == RTAX_ADVMSS || metric == RTAX_MTU); return dst_metric_raw(dst, metric); } static inline u32 dst_metric_advmss(const struct dst_entry *dst) { u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS); if (!advmss) advmss = dst->ops->default_advmss(dst); return advmss; } static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) { u32 *p = dst_metrics_write_ptr(dst); if (p) p[metric-1] = val; } /* Kernel-internal feature bits that are unallocated in user space. */ #define DST_FEATURE_ECN_CA (1U << 31) #define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) #define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { return dst_metric(dst, RTAX_FEATURES) & feature; } INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *)); INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *)); static inline u32 dst_mtu(const struct dst_entry *dst) { return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst); } /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric) { return msecs_to_jiffies(dst_metric(dst, metric)); } static inline u32 dst_allfrag(const struct dst_entry *dst) { int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG); return ret; } static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { return dst_metric(dst, RTAX_LOCK) & (1 << metric); } static inline void dst_hold(struct dst_entry *dst) { /* * If your kernel compilation stops here, please check * the placement of __refcnt in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); } static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { if (unlikely(time != dst->lastuse)) { dst->__use++; dst->lastuse = time; } } static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) dst_hold(dst); return dst; } void dst_release(struct dst_entry *dst); void dst_release_immediate(struct dst_entry *dst); static inline void refdst_drop(unsigned long refdst) { if (!(refdst & SKB_DST_NOREF)) dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK)); } /** * skb_dst_drop - drops skb dst * @skb: buffer * * Drops dst reference count if a reference was taken. */ static inline void skb_dst_drop(struct sk_buff *skb) { if (skb->_skb_refdst) { refdst_drop(skb->_skb_refdst); skb->_skb_refdst = 0UL; } } static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst) { nskb->slow_gro |= !!refdst; nskb->_skb_refdst = refdst; if (!(nskb->_skb_refdst & SKB_DST_NOREF)) dst_clone(skb_dst(nskb)); } static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) { __skb_dst_copy(nskb, oskb->_skb_refdst); } /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry * * This helper returns false if it could not safely * take a reference on a dst. */ static inline bool dst_hold_safe(struct dst_entry *dst) { return atomic_inc_not_zero(&dst->__refcnt); } /** * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. * Returns true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; skb->_skb_refdst = (unsigned long)dst; skb->slow_gro |= !!dst; } return skb->_skb_refdst != 0UL; } /** * __skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups. (no accounting done) */ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { skb->dev = dev; /* * Clear hash so that we can recalulate the hash for the * encapsulated packet, unless we have already determine the hash * over the L4 4-tuple. */ skb_clear_hash_if_not_l4(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, !net_eq(net, dev_net(dev))); } /** * skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups, and perform accounting. * Note: this accounting is not SMP safe. */ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { DEV_STATS_INC(dev, rx_packets); DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } static inline u32 dst_tclassid(const struct sk_buff *skb) { #ifdef CONFIG_IP_ROUTE_CLASSID const struct dst_entry *dst; dst = skb_dst(skb); if (dst) return dst->tclassid; #endif return 0; } int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int dst_discard(struct sk_buff *skb) { return dst_discard_out(&init_net, skb->sk, skb); } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); struct dst_entry *dst_destroy(struct dst_entry *dst); void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) { } static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) { struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr); return IS_ERR(n) ? NULL : n; } static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { struct neighbour *n; if (WARN_ON_ONCE(!dst->ops->neigh_lookup)) return NULL; n = dst->ops->neigh_lookup(dst, skb, NULL); return IS_ERR(n) ? NULL : n; } static inline void dst_confirm_neigh(const struct dst_entry *dst, const void *daddr) { if (dst->ops->confirm_neigh) dst->ops->confirm_neigh(dst, daddr); } static inline void dst_link_failure(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops && dst->ops->link_failure) dst->ops->link_failure(skb); } static inline void dst_set_expires(struct dst_entry *dst, int timeout) { unsigned long expires = jiffies + timeout; if (expires == 0) expires = 1; if (dst->expires == 0 || time_before(expires, dst->expires)) dst->expires = expires; } static inline unsigned int dst_dev_overhead(struct dst_entry *dst, struct sk_buff *skb) { if (likely(dst)) return LL_RESERVED_SPACE(dst->dev); return skb->mac_len; } INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, struct sk_buff *)); /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return INDIRECT_CALL_INET(skb_dst(skb)->output, ip6_output, ip_output, net, sk, skb); } INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { return INDIRECT_CALL_INET(skb_dst(skb)->input, ip6_input, ip_local_deliver, skb); } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { if (dst->obsolete) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; } /* Flags for xfrm_lookup flags argument. */ enum { XFRM_LOOKUP_ICMP = 1 << 0, XFRM_LOOKUP_QUEUE = 1 << 1, XFRM_LOOKUP_KEEP_DST_REF = 1 << 2, }; struct flowi; #ifndef CONFIG_XFRM static inline struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return dst_orig; } static inline struct dst_entry * xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { return dst_orig; } static inline struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return dst_orig; } static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) { return NULL; } #else struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id); struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); /* skb attached with this dst needs transformation if dst->xfrm is valid */ static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) { return dst->xfrm; } #endif static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, true); } /* update dst pmtu but not do neighbor confirm */ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old); struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); unsigned int dst_blackhole_mtu(const struct dst_entry *dst); #endif /* _NET_DST_H */ |
16 1 2 14 15 6 6 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 | /* * Routines to compress and uncompress tcp packets (for transmission * over low speed serial lines). * * Copyright (c) 1989 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms are permitted * provided that the above copyright notice and this paragraph are * duplicated in all such forms and that any documentation, * advertising materials, and other materials related to such * distribution and use acknowledge that the software was developed * by the University of California, Berkeley. The name of the * University may not be used to endorse or promote products derived * from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: * - Initial distribution. * * * modified for KA9Q Internet Software Package by * Katie Stevens (dkstevens@ucdavis.edu) * University of California, Davis * Computing Services * - 01-31-90 initial adaptation (from 1.19) * PPP.05 02-15-90 [ks] * PPP.08 05-02-90 [ks] use PPP protocol field to signal compression * PPP.15 09-90 [ks] improve mbuf handling * PPP.16 11-02 [karn] substantially rewritten to use NOS facilities * * - Feb 1991 Bill_Simpson@um.cc.umich.edu * variable number of conversation slots * allow zero or one slots * separate routines * status display * - Jul 1994 Dmitry Gorodchanin * Fixes for memory leaks. * - Oct 1994 Dmitry Gorodchanin * Modularization. * - Jan 1995 Bjorn Ekwall * Use ip_fast_csum from ip.h * - July 1995 Christos A. Polyzols * Spotted bug in tcp option checking * * * This module is a difficult issue. It's clearly inet code but it's also clearly * driver code belonging close to PPP and SLIP */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <net/slhc_vj.h> #ifdef CONFIG_INET /* Entire module is for IP only */ #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/termios.h> #include <linux/in.h> #include <linux/fcntl.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/icmp.h> #include <net/tcp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <net/checksum.h> #include <asm/unaligned.h> static unsigned char *encode(unsigned char *cp, unsigned short n); static long decode(unsigned char **cpp); static unsigned char * put16(unsigned char *cp, unsigned short x); static unsigned short pull16(unsigned char **cpp); /* Allocate compression data structure * slots must be in range 0 to 255 (zero meaning no compression) * Returns pointer to structure or ERR_PTR() on error. */ struct slcompress * slhc_init(int rslots, int tslots) { short i; struct cstate *ts; struct slcompress *comp; if (rslots < 0 || rslots > 255 || tslots < 0 || tslots > 255) return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct slcompress), GFP_KERNEL); if (! comp) goto out_fail; if (rslots > 0) { size_t rsize = rslots * sizeof(struct cstate); comp->rstate = kzalloc(rsize, GFP_KERNEL); if (! comp->rstate) goto out_free; comp->rslot_limit = rslots - 1; } if (tslots > 0) { size_t tsize = tslots * sizeof(struct cstate); comp->tstate = kzalloc(tsize, GFP_KERNEL); if (! comp->tstate) goto out_free2; comp->tslot_limit = tslots - 1; } comp->xmit_oldest = 0; comp->xmit_current = 255; comp->recv_current = 255; /* * don't accept any packets with implicit index until we get * one with an explicit index. Otherwise the uncompress code * will try to use connection 255, which is almost certainly * out of range */ comp->flags |= SLF_TOSS; if ( tslots > 0 ) { ts = comp->tstate; for(i = comp->tslot_limit; i > 0; --i){ ts[i].cs_this = i; ts[i].next = &(ts[i - 1]); } ts[0].next = &(ts[comp->tslot_limit]); ts[0].cs_this = 0; } return comp; out_free2: kfree(comp->rstate); out_free: kfree(comp); out_fail: return ERR_PTR(-ENOMEM); } /* Free a compression data structure */ void slhc_free(struct slcompress *comp) { if ( IS_ERR_OR_NULL(comp) ) return; if ( comp->tstate != NULLSLSTATE ) kfree( comp->tstate ); if ( comp->rstate != NULLSLSTATE ) kfree( comp->rstate ); kfree( comp ); } /* Put a short in host order into a char array in network order */ static inline unsigned char * put16(unsigned char *cp, unsigned short x) { *cp++ = x >> 8; *cp++ = x; return cp; } /* Encode a number */ static unsigned char * encode(unsigned char *cp, unsigned short n) { if(n >= 256 || n == 0){ *cp++ = 0; cp = put16(cp,n); } else { *cp++ = n; } return cp; } /* Pull a 16-bit integer in host order from buffer in network byte order */ static unsigned short pull16(unsigned char **cpp) { short rval; rval = *(*cpp)++; rval <<= 8; rval |= *(*cpp)++; return rval; } /* Decode a number */ static long decode(unsigned char **cpp) { int x; x = *(*cpp)++; if(x == 0){ return pull16(cpp) & 0xffff; /* pull16 returns -1 on error */ } else { return x & 0xff; /* -1 if PULLCHAR returned error */ } } /* * icp and isize are the original packet. * ocp is a place to put a copy if necessary. * cpp is initially a pointer to icp. If the copy is used, * change it to ocp. */ int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, unsigned char *ocp, unsigned char **cpp, int compress_cid) { struct cstate *ocs = &(comp->tstate[comp->xmit_oldest]); struct cstate *lcs = ocs; struct cstate *cs = lcs->next; unsigned long deltaS, deltaA; short changes = 0; int nlen, hlen; unsigned char new_seq[16]; unsigned char *cp = new_seq; struct iphdr *ip; struct tcphdr *th, *oth; __sum16 csum; /* * Don't play with runt packets. */ if(isize<sizeof(struct iphdr)) return isize; ip = (struct iphdr *) icp; if (ip->version != 4 || ip->ihl < 5) return isize; /* Bail if this packet isn't TCP, or is an IP fragment */ if (ip->protocol != IPPROTO_TCP || (ntohs(ip->frag_off) & 0x3fff)) { /* Send as regular IP */ if(ip->protocol != IPPROTO_TCP) comp->sls_o_nontcp++; else comp->sls_o_tcp++; return isize; } nlen = ip->ihl * 4; if (isize < nlen + sizeof(*th)) return isize; th = (struct tcphdr *)(icp + nlen); if (th->doff < sizeof(struct tcphdr) / 4) return isize; hlen = nlen + th->doff * 4; /* Bail if the TCP packet isn't `compressible' (i.e., ACK isn't set or * some other control bit is set). Also uncompressible if * it's a runt. */ if(hlen > isize || th->syn || th->fin || th->rst || ! (th->ack)){ /* TCP connection stuff; send as regular IP */ comp->sls_o_tcp++; return isize; } /* * Packet is compressible -- we're going to send either a * COMPRESSED_TCP or UNCOMPRESSED_TCP packet. Either way, * we need to locate (or create) the connection state. * * States are kept in a circularly linked list with * xmit_oldest pointing to the end of the list. The * list is kept in lru order by moving a state to the * head of the list whenever it is referenced. Since * the list is short and, empirically, the connection * we want is almost always near the front, we locate * states via linear search. If we don't find a state * for the datagram, the oldest state is (re-)used. */ for ( ; ; ) { if( ip->saddr == cs->cs_ip.saddr && ip->daddr == cs->cs_ip.daddr && th->source == cs->cs_tcp.source && th->dest == cs->cs_tcp.dest) goto found; /* if current equal oldest, at end of list */ if ( cs == ocs ) break; lcs = cs; cs = cs->next; comp->sls_o_searches++; } /* * Didn't find it -- re-use oldest cstate. Send an * uncompressed packet that tells the other side what * connection number we're using for this conversation. * * Note that since the state list is circular, the oldest * state points to the newest and we only need to set * xmit_oldest to update the lru linkage. */ comp->sls_o_misses++; comp->xmit_oldest = lcs->cs_this; goto uncompressed; found: /* * Found it -- move to the front on the connection list. */ if(lcs == ocs) { /* found at most recently used */ } else if (cs == ocs) { /* found at least recently used */ comp->xmit_oldest = lcs->cs_this; } else { /* more than 2 elements */ lcs->next = cs->next; cs->next = ocs->next; ocs->next = cs; } /* * Make sure that only what we expect to change changed. * Check the following: * IP protocol version, header length & type of service. * The "Don't fragment" bit. * The time-to-live field. * The TCP header length. * IP options, if any. * TCP options, if any. * If any of these things are different between the previous & * current datagram, we send the current datagram `uncompressed'. */ oth = &cs->cs_tcp; if(ip->version != cs->cs_ip.version || ip->ihl != cs->cs_ip.ihl || ip->tos != cs->cs_ip.tos || (ip->frag_off & htons(0x4000)) != (cs->cs_ip.frag_off & htons(0x4000)) || ip->ttl != cs->cs_ip.ttl || th->doff != cs->cs_tcp.doff || (ip->ihl > 5 && memcmp(ip+1,cs->cs_ipopt,((ip->ihl)-5)*4) != 0) || (th->doff > 5 && memcmp(th+1,cs->cs_tcpopt,((th->doff)-5)*4) != 0)){ goto uncompressed; } /* * Figure out which of the changing fields changed. The * receiver expects changes in the order: urgent, window, * ack, seq (the order minimizes the number of temporaries * needed in this section of code). */ if(th->urg){ deltaS = ntohs(th->urg_ptr); cp = encode(cp,deltaS); changes |= NEW_U; } else if(th->urg_ptr != oth->urg_ptr){ /* argh! URG not set but urp changed -- a sensible * implementation should never do this but RFC793 * doesn't prohibit the change so we have to deal * with it. */ goto uncompressed; } if((deltaS = ntohs(th->window) - ntohs(oth->window)) != 0){ cp = encode(cp,deltaS); changes |= NEW_W; } if((deltaA = ntohl(th->ack_seq) - ntohl(oth->ack_seq)) != 0L){ if(deltaA > 0x0000ffff) goto uncompressed; cp = encode(cp,deltaA); changes |= NEW_A; } if((deltaS = ntohl(th->seq) - ntohl(oth->seq)) != 0L){ if(deltaS > 0x0000ffff) goto uncompressed; cp = encode(cp,deltaS); changes |= NEW_S; } switch(changes){ case 0: /* Nothing changed. If this packet contains data and the * last one didn't, this is probably a data packet following * an ack (normal on an interactive connection) and we send * it compressed. Otherwise it's probably a retransmit, * retransmitted ack or window probe. Send it uncompressed * in case the other side missed the compressed version. */ if(ip->tot_len != cs->cs_ip.tot_len && ntohs(cs->cs_ip.tot_len) == hlen) break; goto uncompressed; case SPECIAL_I: case SPECIAL_D: /* actual changes match one of our special case encodings -- * send packet uncompressed. */ goto uncompressed; case NEW_S|NEW_A: if(deltaS == deltaA && deltaS == ntohs(cs->cs_ip.tot_len) - hlen){ /* special case for echoed terminal traffic */ changes = SPECIAL_I; cp = new_seq; } break; case NEW_S: if(deltaS == ntohs(cs->cs_ip.tot_len) - hlen){ /* special case for data xfer */ changes = SPECIAL_D; cp = new_seq; } break; } deltaS = ntohs(ip->id) - ntohs(cs->cs_ip.id); if(deltaS != 1){ cp = encode(cp,deltaS); changes |= NEW_I; } if(th->psh) changes |= TCP_PUSH_BIT; /* Grab the cksum before we overwrite it below. Then update our * state with this packet's header. */ csum = th->check; memcpy(&cs->cs_ip,ip,20); memcpy(&cs->cs_tcp,th,20); /* We want to use the original packet as our compressed packet. * (cp - new_seq) is the number of bytes we need for compressed * sequence numbers. In addition we need one byte for the change * mask, one for the connection id and two for the tcp checksum. * So, (cp - new_seq) + 4 bytes of header are needed. */ deltaS = cp - new_seq; if(compress_cid == 0 || comp->xmit_current != cs->cs_this){ cp = ocp; *cpp = ocp; *cp++ = changes | NEW_C; *cp++ = cs->cs_this; comp->xmit_current = cs->cs_this; } else { cp = ocp; *cpp = ocp; *cp++ = changes; } *(__sum16 *)cp = csum; cp += 2; /* deltaS is now the size of the change section of the compressed header */ memcpy(cp,new_seq,deltaS); /* Write list of deltas */ memcpy(cp+deltaS,icp+hlen,isize-hlen); comp->sls_o_compressed++; ocp[0] |= SL_TYPE_COMPRESSED_TCP; return isize - hlen + deltaS + (cp - ocp); /* Update connection state cs & send uncompressed packet (i.e., * a regular ip/tcp packet but with the 'conversation id' we hope * to use on future compressed packets in the protocol field). */ uncompressed: memcpy(&cs->cs_ip,ip,20); memcpy(&cs->cs_tcp,th,20); if (ip->ihl > 5) memcpy(cs->cs_ipopt, ip+1, ((ip->ihl) - 5) * 4); if (th->doff > 5) memcpy(cs->cs_tcpopt, th+1, ((th->doff) - 5) * 4); comp->xmit_current = cs->cs_this; comp->sls_o_uncompressed++; memcpy(ocp, icp, isize); *cpp = ocp; ocp[9] = cs->cs_this; ocp[0] |= SL_TYPE_UNCOMPRESSED_TCP; return isize; } int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) { int changes; long x; struct tcphdr *thp; struct iphdr *ip; struct cstate *cs; int len, hdrlen; unsigned char *cp = icp; /* We've got a compressed packet; read the change byte */ comp->sls_i_compressed++; if(isize < 3){ comp->sls_i_error++; return 0; } changes = *cp++; if(changes & NEW_C){ /* Make sure the state index is in range, then grab the state. * If we have a good state index, clear the 'discard' flag. */ x = *cp++; /* Read conn index */ if(x < 0 || x > comp->rslot_limit) goto bad; /* Check if the cstate is initialized */ if (!comp->rstate[x].initialized) goto bad; comp->flags &=~ SLF_TOSS; comp->recv_current = x; } else { /* this packet has an implicit state index. If we've * had a line error since the last time we got an * explicit state index, we have to toss the packet. */ if(comp->flags & SLF_TOSS){ comp->sls_i_tossed++; return 0; } } cs = &comp->rstate[comp->recv_current]; thp = &cs->cs_tcp; ip = &cs->cs_ip; thp->check = *(__sum16 *)cp; cp += 2; thp->psh = (changes & TCP_PUSH_BIT) ? 1 : 0; /* * we can use the same number for the length of the saved header and * the current one, because the packet wouldn't have been sent * as compressed unless the options were the same as the previous one */ hdrlen = ip->ihl * 4 + thp->doff * 4; switch(changes & SPECIALS_MASK){ case SPECIAL_I: /* Echoed terminal traffic */ { short i; i = ntohs(ip->tot_len) - hdrlen; thp->ack_seq = htonl( ntohl(thp->ack_seq) + i); thp->seq = htonl( ntohl(thp->seq) + i); } break; case SPECIAL_D: /* Unidirectional data */ thp->seq = htonl( ntohl(thp->seq) + ntohs(ip->tot_len) - hdrlen); break; default: if(changes & NEW_U){ thp->urg = 1; if((x = decode(&cp)) == -1) { goto bad; } thp->urg_ptr = htons(x); } else thp->urg = 0; if(changes & NEW_W){ if((x = decode(&cp)) == -1) { goto bad; } thp->window = htons( ntohs(thp->window) + x); } if(changes & NEW_A){ if((x = decode(&cp)) == -1) { goto bad; } thp->ack_seq = htonl( ntohl(thp->ack_seq) + x); } if(changes & NEW_S){ if((x = decode(&cp)) == -1) { goto bad; } thp->seq = htonl( ntohl(thp->seq) + x); } break; } if(changes & NEW_I){ if((x = decode(&cp)) == -1) { goto bad; } ip->id = htons (ntohs (ip->id) + x); } else ip->id = htons (ntohs (ip->id) + 1); /* * At this point, cp points to the first byte of data in the * packet. Put the reconstructed TCP and IP headers back on the * packet. Recalculate IP checksum (but not TCP checksum). */ len = isize - (cp - icp); if (len < 0) goto bad; len += hdrlen; ip->tot_len = htons(len); ip->check = 0; memmove(icp + hdrlen, cp, len - hdrlen); cp = icp; memcpy(cp, ip, 20); cp += 20; if (ip->ihl > 5) { memcpy(cp, cs->cs_ipopt, (ip->ihl - 5) * 4); cp += (ip->ihl - 5) * 4; } put_unaligned(ip_fast_csum(icp, ip->ihl), &((struct iphdr *)icp)->check); memcpy(cp, thp, 20); cp += 20; if (thp->doff > 5) { memcpy(cp, cs->cs_tcpopt, ((thp->doff) - 5) * 4); cp += ((thp->doff) - 5) * 4; } return len; bad: comp->sls_i_error++; return slhc_toss( comp ); } int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { const struct tcphdr *th; unsigned char index; struct iphdr *iph; struct cstate *cs; unsigned int ihl; /* The packet is shorter than a legal IP header. * Also make sure isize is positive. */ if (isize < (int)sizeof(struct iphdr)) { runt: comp->sls_i_runt++; return slhc_toss(comp); } iph = (struct iphdr *)icp; /* Peek at the IP header's IHL field to find its length */ ihl = iph->ihl; /* The IP header length field is too small, * or packet is shorter than the IP header followed * by minimal tcp header. */ if (ihl < 5 || isize < ihl * 4 + sizeof(struct tcphdr)) goto runt; index = iph->protocol; iph->protocol = IPPROTO_TCP; if (ip_fast_csum(icp, ihl)) { /* Bad IP header checksum; discard */ comp->sls_i_badcheck++; return slhc_toss(comp); } if (index > comp->rslot_limit) { comp->sls_i_error++; return slhc_toss(comp); } th = (struct tcphdr *)(icp + ihl * 4); if (th->doff < sizeof(struct tcphdr) / 4) goto runt; if (isize < ihl * 4 + th->doff * 4) goto runt; /* Update local state */ cs = &comp->rstate[comp->recv_current = index]; comp->flags &=~ SLF_TOSS; memcpy(&cs->cs_ip, iph, sizeof(*iph)); memcpy(&cs->cs_tcp, th, sizeof(*th)); if (ihl > 5) memcpy(cs->cs_ipopt, &iph[1], (ihl - 5) * 4); if (th->doff > 5) memcpy(cs->cs_tcpopt, &th[1], (th->doff - 5) * 4); cs->cs_hsize = ihl*2 + th->doff*2; cs->initialized = true; /* Put headers back on packet * Neither header checksum is recalculated */ comp->sls_i_uncompressed++; return isize; } int slhc_toss(struct slcompress *comp) { if ( comp == NULLSLCOMPR ) return 0; comp->flags |= SLF_TOSS; return 0; } #else /* CONFIG_INET */ int slhc_toss(struct slcompress *comp) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_toss"); return -EINVAL; } int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_uncompress"); return -EINVAL; } int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, unsigned char *ocp, unsigned char **cpp, int compress_cid) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_compress"); return -EINVAL; } int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_remember"); return -EINVAL; } void slhc_free(struct slcompress *comp) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_free"); } struct slcompress * slhc_init(int rslots, int tslots) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_init"); return NULL; } #endif /* CONFIG_INET */ /* VJ header compression */ EXPORT_SYMBOL(slhc_init); EXPORT_SYMBOL(slhc_free); EXPORT_SYMBOL(slhc_remember); EXPORT_SYMBOL(slhc_compress); EXPORT_SYMBOL(slhc_uncompress); EXPORT_SYMBOL(slhc_toss); MODULE_LICENSE("Dual BSD/GPL"); |
161 27 155 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TRACE_SYSCALL_H #define _TRACE_SYSCALL_H #include <linux/tracepoint.h> #include <linux/unistd.h> #include <linux/trace_events.h> #include <linux/thread_info.h> #include <asm/ptrace.h> /* * A syscall entry in the ftrace syscalls array. * * @name: name of the syscall * @syscall_nr: number of the syscall * @nb_args: number of parameters it takes * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) * @enter_fields: list of fields for syscall_enter trace event * @enter_event: associated syscall_enter trace event * @exit_event: associated syscall_exit trace event */ struct syscall_metadata { const char *name; int syscall_nr; int nb_args; const char **types; const char **args; struct list_head enter_fields; struct trace_event_call *enter_event; struct trace_event_call *exit_event; }; #if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) static inline void syscall_tracepoint_update(struct task_struct *p) { if (test_syscall_work(SYSCALL_TRACEPOINT)) set_task_syscall_work(p, SYSCALL_TRACEPOINT); else clear_task_syscall_work(p, SYSCALL_TRACEPOINT); } #else static inline void syscall_tracepoint_update(struct task_struct *p) { } #endif #endif /* _TRACE_SYSCALL_H */ |
3 3 3 3 3 3 3 3 3 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> #define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE) #define TAP_VNET_LE 0x80000000 #define TAP_VNET_BE 0x40000000 #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tap_legacy_is_little_endian(struct tap_queue *q) { return q->flags & TAP_VNET_BE ? false : virtio_legacy_is_little_endian(); } static long tap_get_vnet_be(struct tap_queue *q, int __user *sp) { int s = !!(q->flags & TAP_VNET_BE); if (put_user(s, sp)) return -EFAULT; return 0; } static long tap_set_vnet_be(struct tap_queue *q, int __user *sp) { int s; if (get_user(s, sp)) return -EFAULT; if (s) q->flags |= TAP_VNET_BE; else q->flags &= ~TAP_VNET_BE; return 0; } #else static inline bool tap_legacy_is_little_endian(struct tap_queue *q) { return virtio_legacy_is_little_endian(); } static long tap_get_vnet_be(struct tap_queue *q, int __user *argp) { return -EINVAL; } static long tap_set_vnet_be(struct tap_queue *q, int __user *argp) { return -EINVAL; } #endif /* CONFIG_TUN_VNET_CROSS_LE */ static inline bool tap_is_little_endian(struct tap_queue *q) { return q->flags & TAP_VNET_LE || tap_legacy_is_little_endian(q); } static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val) { return __virtio16_to_cpu(tap_is_little_endian(q), val); } static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val) { return __cpu_to_virtio16(tap_is_little_endian(q), val); } static struct proto tap_proto = { .name = "tap", .owner = THIS_MODULE, .obj_size = sizeof(struct tap_queue), }; #define TAP_NUM_DEVS (1U << MINORBITS) static LIST_HEAD(major_list); struct major_info { struct rcu_head rcu; dev_t major; struct idr minor_idr; spinlock_t minor_lock; const char *device_name; struct list_head next; }; #define GOODCOPY_LEN 128 static const struct proto_ops tap_socket_ops; #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO) #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST) static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } /* * RCU usage: * The tap_queue and the macvlan_dev are loosely coupled, the * pointers from one to the other can only be read while rcu_read_lock * or rtnl is held. * * Both the file and the macvlan_dev hold a reference on the tap_queue * through sock_hold(&q->sk). When the macvlan_dev goes away first, * q->vlan becomes inaccessible. When the files gets closed, * tap_get_queue() fails. * * There may still be references to the struct sock inside of the * queue from outbound SKBs, but these never reference back to the * file or the dev. The data structure is freed through __sk_free * when both our references and any pending SKBs are gone. */ static int tap_enable_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { int err = -EINVAL; ASSERT_RTNL(); if (q->enabled) goto out; err = 0; rcu_assign_pointer(tap->taps[tap->numvtaps], q); q->queue_index = tap->numvtaps; q->enabled = true; tap->numvtaps++; out: return err; } /* Requires RTNL */ static int tap_set_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { if (tap->numqueues == MAX_TAP_QUEUES) return -EBUSY; rcu_assign_pointer(q->tap, tap); rcu_assign_pointer(tap->taps[tap->numvtaps], q); sock_hold(&q->sk); q->file = file; q->queue_index = tap->numvtaps; q->enabled = true; file->private_data = q; list_add_tail(&q->next, &tap->queue_list); tap->numvtaps++; tap->numqueues++; return 0; } static int tap_disable_queue(struct tap_queue *q) { struct tap_dev *tap; struct tap_queue *nq; ASSERT_RTNL(); if (!q->enabled) return -EINVAL; tap = rtnl_dereference(q->tap); if (tap) { int index = q->queue_index; BUG_ON(index >= tap->numvtaps); nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]); nq->queue_index = index; rcu_assign_pointer(tap->taps[index], nq); RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL); q->enabled = false; tap->numvtaps--; } return 0; } /* * The file owning the queue got closed, give up both * the reference that the files holds as well as the * one from the macvlan_dev if that still exists. * * Using the spinlock makes sure that we don't get * to the queue again after destroying it. */ static void tap_put_queue(struct tap_queue *q) { struct tap_dev *tap; rtnl_lock(); tap = rtnl_dereference(q->tap); if (tap) { if (q->enabled) BUG_ON(tap_disable_queue(q)); tap->numqueues--; RCU_INIT_POINTER(q->tap, NULL); sock_put(&q->sk); list_del_init(&q->next); } rtnl_unlock(); synchronize_rcu(); sock_put(&q->sk); } /* * Select a queue based on the rxq of the device on which this packet * arrived. If the incoming device is not mq, calculate a flow hash * to select a queue. If all fails, find the first available queue. * Cache vlan->numvtaps since it can become zero during the execution * of this function. */ static struct tap_queue *tap_get_queue(struct tap_dev *tap, struct sk_buff *skb) { struct tap_queue *queue = NULL; /* Access to taps array is protected by rcu, but access to numvtaps * isn't. Below we use it to lookup a queue, but treat it as a hint * and validate that the result isn't NULL - in case we are * racing against queue removal. */ int numvtaps = READ_ONCE(tap->numvtaps); __u32 rxq; if (!numvtaps) goto out; if (numvtaps == 1) goto single; /* Check if we can use flow to select a queue */ rxq = skb_get_hash(skb); if (rxq) { queue = rcu_dereference(tap->taps[rxq % numvtaps]); goto out; } if (likely(skb_rx_queue_recorded(skb))) { rxq = skb_get_rx_queue(skb); while (unlikely(rxq >= numvtaps)) rxq -= numvtaps; queue = rcu_dereference(tap->taps[rxq]); goto out; } single: queue = rcu_dereference(tap->taps[0]); out: return queue; } /* * The net_device is going away, give up the reference * that it holds on all queues and safely set the pointer * from the queues to NULL. */ void tap_del_queues(struct tap_dev *tap) { struct tap_queue *q, *tmp; ASSERT_RTNL(); list_for_each_entry_safe(q, tmp, &tap->queue_list, next) { list_del_init(&q->next); RCU_INIT_POINTER(q->tap, NULL); if (q->enabled) tap->numvtaps--; tap->numqueues--; sock_put(&q->sk); } BUG_ON(tap->numvtaps); BUG_ON(tap->numqueues); /* guarantee that any future tap_set_queue will fail */ tap->numvtaps = MAX_TAP_QUEUES; } EXPORT_SYMBOL_GPL(tap_del_queues); rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct tap_dev *tap; struct tap_queue *q; netdev_features_t features = TAP_FEATURES; tap = tap_dev_get_rcu(dev); if (!tap) return RX_HANDLER_PASS; q = tap_get_queue(tap, skb); if (!q) return RX_HANDLER_PASS; skb_push(skb, ETH_HLEN); /* Apply the forward feature mask so that we perform segmentation * according to users wishes. This only works if VNET_HDR is * enabled. */ if (q->flags & IFF_VNET_HDR) features |= tap->tap_features; if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); struct sk_buff *next; if (IS_ERR(segs)) goto drop; if (!segs) { if (ptr_ring_produce(&q->ring, skb)) goto drop; goto wake_up; } consume_skb(skb); skb_list_walk_safe(segs, skb, next) { skb_mark_not_on_list(skb); if (ptr_ring_produce(&q->ring, skb)) { kfree_skb(skb); kfree_skb_list(next); break; } } } else { /* If we receive a partial checksum and the tap side * doesn't support checksum offload, compute the checksum. * Note: it doesn't matter which checksum feature to * check, we either support them all or none. */ if (skb->ip_summed == CHECKSUM_PARTIAL && !(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) goto drop; if (ptr_ring_produce(&q->ring, skb)) goto drop; } wake_up: wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); return RX_HANDLER_CONSUMED; drop: /* Count errors/drops only here, thus don't care about args. */ if (tap->count_rx_dropped) tap->count_rx_dropped(tap); kfree_skb(skb); return RX_HANDLER_CONSUMED; } EXPORT_SYMBOL_GPL(tap_handle_frame); static struct major_info *tap_get_major(int major) { struct major_info *tap_major; list_for_each_entry_rcu(tap_major, &major_list, next) { if (tap_major->major == major) return tap_major; } return NULL; } int tap_get_minor(dev_t major, struct tap_dev *tap) { int retval = -ENOMEM; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { retval = -EINVAL; goto unlock; } spin_lock(&tap_major->minor_lock); retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC); if (retval >= 0) { tap->minor = retval; } else if (retval == -ENOSPC) { netdev_err(tap->dev, "Too many tap devices\n"); retval = -EINVAL; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return retval < 0 ? retval : 0; } EXPORT_SYMBOL_GPL(tap_get_minor); void tap_free_minor(dev_t major, struct tap_dev *tap) { struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { goto unlock; } spin_lock(&tap_major->minor_lock); if (tap->minor) { idr_remove(&tap_major->minor_idr, tap->minor); tap->minor = 0; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(tap_free_minor); static struct tap_dev *dev_get_by_tap_file(int major, int minor) { struct net_device *dev = NULL; struct tap_dev *tap; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(major); if (!tap_major) { tap = NULL; goto unlock; } spin_lock(&tap_major->minor_lock); tap = idr_find(&tap_major->minor_idr, minor); if (tap) { dev = tap->dev; dev_hold(dev); } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return tap; } static void tap_sock_write_space(struct sock *sk) { wait_queue_head_t *wqueue; if (!sock_writeable(sk) || !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } static void tap_sock_destruct(struct sock *sk) { struct tap_queue *q = container_of(sk, struct tap_queue, sk); ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb); } static int tap_open(struct inode *inode, struct file *file) { struct net *net = current->nsproxy->net_ns; struct tap_dev *tap; struct tap_queue *q; int err = -ENODEV; rtnl_lock(); tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); if (!tap) goto err; err = -ENOMEM; q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tap_proto, 0); if (!q) goto err; if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) { sk_free(&q->sk); goto err; } init_waitqueue_head(&q->sock.wq.wait); q->sock.type = SOCK_RAW; q->sock.state = SS_CONNECTED; q->sock.file = file; q->sock.ops = &tap_socket_ops; sock_init_data_uid(&q->sock, &q->sk, current_fsuid()); q->sk.sk_write_space = tap_sock_write_space; q->sk.sk_destruct = tap_sock_destruct; q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); /* * so far only KVM virtio_net uses tap, enable zero copy between * guest kernel and host kernel when lower device supports zerocopy * * The macvlan supports zerocopy iff the lower device supports zero * copy so we don't have to look at the lower device directly. */ if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG)) sock_set_flag(&q->sk, SOCK_ZEROCOPY); err = tap_set_queue(tap, file, q); if (err) { /* tap_sock_destruct() will take care of freeing ptr_ring */ goto err_put; } dev_put(tap->dev); rtnl_unlock(); return err; err_put: sock_put(&q->sk); err: if (tap) dev_put(tap->dev); rtnl_unlock(); return err; } static int tap_release(struct inode *inode, struct file *file) { struct tap_queue *q = file->private_data; tap_put_queue(q); return 0; } static __poll_t tap_poll(struct file *file, poll_table *wait) { struct tap_queue *q = file->private_data; __poll_t mask = EPOLLERR; if (!q) goto out; mask = 0; poll_wait(file, &q->sock.wq.wait, wait); if (!ptr_ring_empty(&q->ring)) mask |= EPOLLIN | EPOLLRDNORM; if (sock_writeable(&q->sk) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) && sock_writeable(&q->sk))) mask |= EPOLLOUT | EPOLLWRNORM; out: return mask; } static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, 0); if (!skb) return NULL; skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } /* Neighbour code has some assumptions on HH_DATA_MOD alignment */ #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN) /* Get packet from user space buffer */ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct iov_iter *from, int noblock) { int good_linear = SKB_MAX_HEAD(TAP_RESERVE); struct sk_buff *skb; struct tap_dev *tap; unsigned long total_len = iov_iter_count(from); unsigned long len = total_len; int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; int copylen = 0; int depth; bool zerocopy = false; size_t linear; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); err = -EINVAL; if (len < vnet_hdr_len) goto err; len -= vnet_hdr_len; err = -EFAULT; if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from)) goto err; iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr)); if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && tap16_to_cpu(q, vnet_hdr.csum_start) + tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 > tap16_to_cpu(q, vnet_hdr.hdr_len)) vnet_hdr.hdr_len = cpu_to_tap16(q, tap16_to_cpu(q, vnet_hdr.csum_start) + tap16_to_cpu(q, vnet_hdr.csum_offset) + 2); err = -EINVAL; if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len) goto err; } err = -EINVAL; if (unlikely(len < ETH_HLEN)) goto err; if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i; copylen = vnet_hdr.hdr_len ? tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; else if (copylen < ETH_HLEN) copylen = ETH_HLEN; linear = copylen; i = *from; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!zerocopy) { copylen = len; linear = tap16_to_cpu(q, vnet_hdr.hdr_len); if (linear > good_linear) linear = good_linear; else if (linear < ETH_HLEN) linear = ETH_HLEN; } skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen, linear, noblock, &err); if (!skb) goto err; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) goto err_kfree; skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; if (vnet_hdr_len) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, tap_is_little_endian(q)); if (err) goto err_kfree; } skb_probe_transport_header(skb); /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); rcu_read_lock(); tap = rcu_dereference(q->tap); /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->callback(NULL, uarg, false); } if (tap) { skb->dev = tap->dev; dev_queue_xmit(skb); } else { kfree_skb(skb); } rcu_read_unlock(); return total_len; err_kfree: kfree_skb(skb); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK); } /* Put packet to the user space buffer */ static ssize_t tap_put_user(struct tap_queue *q, const struct sk_buff *skb, struct iov_iter *iter) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; int total; if (q->flags & IFF_VNET_HDR) { int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; struct virtio_net_hdr vnet_hdr; vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); if (iov_iter_count(iter) < vnet_hdr_len) return -EINVAL; if (virtio_net_hdr_from_skb(skb, &vnet_hdr, tap_is_little_endian(q), true, vlan_hlen)) BUG(); if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) != sizeof(vnet_hdr)) return -EFAULT; iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr)); } total = vnet_hdr_len; total += skb->len; if (skb_vlan_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); total += VLAN_HLEN; ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } ret = skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: return ret ? ret : total; } static ssize_t tap_do_read(struct tap_queue *q, struct iov_iter *to, int noblock, struct sk_buff *skb) { DEFINE_WAIT(wait); ssize_t ret = 0; if (!iov_iter_count(to)) { kfree_skb(skb); return 0; } if (skb) goto put; while (1) { if (!noblock) prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE); /* Read frames from the queue */ skb = ptr_ring_consume(&q->ring); if (skb) break; if (noblock) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } /* Nothing to read, let's sleep */ schedule(); } if (!noblock) finish_wait(sk_sleep(&q->sk), &wait); put: if (skb) { ret = tap_put_user(q, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; ssize_t len = iov_iter_count(to), ret; ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; return ret; } static struct tap_dev *tap_get_tap_dev(struct tap_queue *q) { struct tap_dev *tap; ASSERT_RTNL(); tap = rtnl_dereference(q->tap); if (tap) dev_hold(tap->dev); return tap; } static void tap_put_tap_dev(struct tap_dev *tap) { dev_put(tap->dev); } static int tap_ioctl_set_queue(struct file *file, unsigned int flags) { struct tap_queue *q = file->private_data; struct tap_dev *tap; int ret; tap = tap_get_tap_dev(q); if (!tap) return -EINVAL; if (flags & IFF_ATTACH_QUEUE) ret = tap_enable_queue(tap, file, q); else if (flags & IFF_DETACH_QUEUE) ret = tap_disable_queue(q); else ret = -EINVAL; tap_put_tap_dev(tap); return ret; } static int set_offload(struct tap_queue *q, unsigned long arg) { struct tap_dev *tap; netdev_features_t features; netdev_features_t feature_mask = 0; tap = rtnl_dereference(q->tap); if (!tap) return -ENOLINK; features = tap->dev->features; if (arg & TUN_F_CSUM) { feature_mask = NETIF_F_HW_CSUM; if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) feature_mask |= NETIF_F_TSO_ECN; if (arg & TUN_F_TSO4) feature_mask |= NETIF_F_TSO; if (arg & TUN_F_TSO6) feature_mask |= NETIF_F_TSO6; } } /* tun/tap driver inverts the usage for TSO offloads, where * setting the TSO bit means that the userspace wants to * accept TSO frames and turning it off means that user space * does not support TSO. * For tap, we have to invert it to mean the same thing. * When user space turns off TSO, we turn off GSO/LRO so that * user-space will not receive TSO frames. */ if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6)) features |= RX_OFFLOADS; else features &= ~RX_OFFLOADS; /* tap_features are the same as features on tun/tap and * reflect user expectations. */ tap->tap_features = feature_mask; if (tap->update_features) tap->update_features(tap, features); return 0; } /* * provide compatibility with generic tun/tap interface */ static long tap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tap_queue *q = file->private_data; struct tap_dev *tap; void __user *argp = (void __user *)arg; struct ifreq __user *ifr = argp; unsigned int __user *up = argp; unsigned short u; int __user *sp = argp; struct sockaddr sa; int s; int ret; switch (cmd) { case TUNSETIFF: /* ignore the name, just look at flags */ if (get_user(u, &ifr->ifr_flags)) return -EFAULT; ret = 0; if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP)) ret = -EINVAL; else q->flags = (q->flags & ~TAP_IFFEATURES) | u; return ret; case TUNGETIFF: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; u = q->flags; if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || put_user(u, &ifr->ifr_flags)) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case TUNSETQUEUE: if (get_user(u, &ifr->ifr_flags)) return -EFAULT; rtnl_lock(); ret = tap_ioctl_set_queue(file, u); rtnl_unlock(); return ret; case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up)) return -EFAULT; return 0; case TUNSETSNDBUF: if (get_user(s, sp)) return -EFAULT; if (s <= 0) return -EINVAL; q->sk.sk_sndbuf = s; return 0; case TUNGETVNETHDRSZ: s = q->vnet_hdr_sz; if (put_user(s, sp)) return -EFAULT; return 0; case TUNSETVNETHDRSZ: if (get_user(s, sp)) return -EFAULT; if (s < (int)sizeof(struct virtio_net_hdr)) return -EINVAL; q->vnet_hdr_sz = s; return 0; case TUNGETVNETLE: s = !!(q->flags & TAP_VNET_LE); if (put_user(s, sp)) return -EFAULT; return 0; case TUNSETVNETLE: if (get_user(s, sp)) return -EFAULT; if (s) q->flags |= TAP_VNET_LE; else q->flags &= ~TAP_VNET_LE; return 0; case TUNGETVNETBE: return tap_get_vnet_be(q, sp); case TUNSETVNETBE: return tap_set_vnet_be(q, sp); case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN | TUN_F_UFO)) return -EINVAL; rtnl_lock(); ret = set_offload(q, arg); rtnl_unlock(); return ret; case SIOCGIFHWADDR: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; dev_get_mac_address(&sa, dev_net(tap->dev), tap->dev->name); if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || copy_to_user(&ifr->ifr_hwaddr, &sa, sizeof(sa))) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case SIOCSIFHWADDR: if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa))) return -EFAULT; rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = dev_set_mac_address_user(tap->dev, &sa, NULL); tap_put_tap_dev(tap); rtnl_unlock(); return ret; default: return -EINVAL; } } static const struct file_operations tap_fops = { .owner = THIS_MODULE, .open = tap_open, .release = tap_release, .read_iter = tap_read_iter, .write_iter = tap_write_iter, .poll = tap_poll, .llseek = no_llseek, .unlocked_ioctl = tap_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) { struct tun_xdp_hdr *hdr = xdp->data_hard_start; struct virtio_net_hdr *gso = &hdr->gso; int buflen = hdr->buflen; int vnet_hdr_len = 0; struct tap_dev *tap; struct sk_buff *skb; int err, depth; if (unlikely(xdp->data_end - xdp->data < ETH_HLEN)) { err = -EINVAL; goto err; } if (q->flags & IFF_VNET_HDR) vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { err = -ENOMEM; goto err; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; if (vnet_hdr_len) { err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q)); if (err) goto err_kfree; } /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; skb_probe_transport_header(skb); dev_queue_xmit(skb); } else { kfree_skb(skb); } rcu_read_unlock(); return 0; err_kfree: kfree_skb(skb); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static int tap_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; int i; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { for (i = 0; i < ctl->num; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; tap_get_user_xdp(q, xdp); } return 0; } return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); } static int tap_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct sk_buff *skb = m->msg_control; int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { kfree_skb(skb); return -EINVAL; } ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } return ret; } static int tap_peek_len(struct socket *sock) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag); } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tap_socket_ops = { .sendmsg = tap_sendmsg, .recvmsg = tap_recvmsg, .peek_len = tap_peek_len, }; /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tap_get_socket(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->sock; } EXPORT_SYMBOL_GPL(tap_get_socket); struct ptr_ring *tap_get_ptr_ring(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->ring; } EXPORT_SYMBOL_GPL(tap_get_ptr_ring); int tap_queue_resize(struct tap_dev *tap) { struct net_device *dev = tap->dev; struct tap_queue *q; struct ptr_ring **rings; int n = tap->numqueues; int ret, i = 0; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; list_for_each_entry(q, &tap->queue_list, next) rings[i++] = &q->ring; ret = ptr_ring_resize_multiple(rings, n, dev->tx_queue_len, GFP_KERNEL, __skb_array_destroy_skb); kfree(rings); return ret; } EXPORT_SYMBOL_GPL(tap_queue_resize); static int tap_list_add(dev_t major, const char *device_name) { struct major_info *tap_major; tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC); if (!tap_major) return -ENOMEM; tap_major->major = MAJOR(major); idr_init(&tap_major->minor_idr); spin_lock_init(&tap_major->minor_lock); tap_major->device_name = device_name; list_add_tail_rcu(&tap_major->next, &major_list); return 0; } int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, const char *device_name, struct module *module) { int err; err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name); if (err) goto out1; cdev_init(tap_cdev, &tap_fops); tap_cdev->owner = module; err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS); if (err) goto out2; err = tap_list_add(*tap_major, device_name); if (err) goto out3; return 0; out3: cdev_del(tap_cdev); out2: unregister_chrdev_region(*tap_major, TAP_NUM_DEVS); out1: return err; } EXPORT_SYMBOL_GPL(tap_create_cdev); void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev) { struct major_info *tap_major, *tmp; cdev_del(tap_cdev); unregister_chrdev_region(major, TAP_NUM_DEVS); list_for_each_entry_safe(tap_major, tmp, &major_list, next) { if (tap_major->major == MAJOR(major)) { idr_destroy(&tap_major->minor_idr); list_del_rcu(&tap_major->next); kfree_rcu(tap_major, rcu); } } } EXPORT_SYMBOL_GPL(tap_destroy_cdev); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_LICENSE("GPL"); |
23 2841 2840 6 2823 19 2807 2818 12 975 3 3 26 8 33 32 4 7 22 4 2803 1007 1897 30 978 981 977 3 9 2801 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/realpath.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/magic.h> #include <linux/proc_fs.h> /** * tomoyo_encode2 - Encode binary string to ascii string. * * @str: String in binary format. * @str_len: Size of @str in byte. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode2(const char *str, int str_len) { int i; int len = 0; const char *p = str; char *cp; char *cp0; if (!p) return NULL; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') len += 2; else if (c > ' ' && c < 127) len++; else len += 4; } len++; /* Reserve space for appending "/". */ cp = kzalloc(len + 10, GFP_NOFS); if (!cp) return NULL; cp0 = cp; p = str; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } } return cp0; } /** * tomoyo_encode - Encode binary string to ascii string. * * @str: String in binary format. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode(const char *str) { return str ? tomoyo_encode2(str, strlen(str)) : NULL; } /** * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root. * * @path: Pointer to "struct path". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { /* go to whatever namespace root we are under */ pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(path->dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_dentry_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { pos = dentry_path_raw(dentry, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_local_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. */ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, const int buflen) { struct super_block *sb = dentry->d_sb; char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen); if (IS_ERR(pos)) return pos; /* Convert from $PID to self if $PID is current thread. */ if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { pos = ep - 5; if (pos < buffer) goto out; memmove(pos, "/self", 5); } goto prepend_filesystem_name; } /* Use filesystem name for unnamed devices. */ if (!MAJOR(sb->s_dev)) goto prepend_filesystem_name; { struct inode *inode = d_backing_inode(sb->s_root); /* * Use filesystem name if filesystem does not support rename() * operation. */ if (!inode->i_op->rename) goto prepend_filesystem_name; } /* Prepend device name. */ { char name[64]; int name_len; const dev_t dev = sb->s_dev; name[sizeof(name) - 1] = '\0'; snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev), MINOR(dev)); name_len = strlen(name); pos -= name_len; if (pos < buffer) goto out; memmove(pos, name, name_len); return pos; } /* Prepend filesystem name. */ prepend_filesystem_name: { const char *name = sb->s_type->name; const int name_len = strlen(name); pos -= name_len + 1; if (pos < buffer) goto out; memmove(pos, name, name_len); pos[name_len] = ':'; } return pos; out: return ERR_PTR(-ENOMEM); } /** * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root. * * @path: Pointer to "struct path". * * Returns the realpath of the given @path on success, NULL otherwise. * * If dentry is a directory, trailing '/' is appended. * Characters out of 0x20 < c < 0x7F range are converted to * \ooo style octal string. * Character \ is converted to \\ string. * * These functions use kzalloc(), so the caller must call kfree() * if these functions didn't return NULL. */ char *tomoyo_realpath_from_path(const struct path *path) { char *buf = NULL; char *name = NULL; unsigned int buf_len = PAGE_SIZE / 2; struct dentry *dentry = path->dentry; struct super_block *sb; if (!dentry) return NULL; sb = dentry->d_sb; while (1) { char *pos; struct inode *inode; buf_len <<= 1; kfree(buf); buf = kmalloc(buf_len, GFP_NOFS); if (!buf) break; /* To make sure that pos is '\0' terminated. */ buf[buf_len - 1] = '\0'; /* For "pipe:[\$]" and "socket:[\$]". */ if (dentry->d_op && dentry->d_op->d_dname) { pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1); goto encode; } inode = d_backing_inode(sb->s_root); /* * Get local name for filesystems without rename() operation * or dentry without vfsmount. */ if (!path->mnt || (!inode->i_op->rename && !(sb->s_type->fs_flags & FS_REQUIRES_DEV))) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. */ else { pos = tomoyo_get_absolute_path(path, buf, buf_len - 1); /* * Fall back to local name if absolute name is not * available. */ if (pos == ERR_PTR(-EINVAL)) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); } encode: if (IS_ERR(pos)) continue; name = tomoyo_encode(pos); break; } kfree(buf); if (!name) tomoyo_warn_oom(__func__); return name; } /** * tomoyo_realpath_nofollow - Get realpath of a pathname. * * @pathname: The pathname to solve. * * Returns the realpath of @pathname on success, NULL otherwise. */ char *tomoyo_realpath_nofollow(const char *pathname) { struct path path; if (pathname && kern_path(pathname, 0, &path) == 0) { char *buf = tomoyo_realpath_from_path(&path); path_put(&path); return buf; } return NULL; } |
101 99 99 1 98 15 3 97 95 95 88 15 3 14 14 4 16 93 92 93 2 2 1 89 88 4 1 101 2 104 104 101 101 19 1 1 93 90 14 3 3 5 94 17 89 3 14 14 3 20 89 88 88 89 6 15 15 15 17 92 92 14 94 93 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 | // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking protocol helper module for SCTP. * * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. */ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/sctp.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_NONE] = "NONE", [SCTP_CONNTRACK_CLOSED] = "CLOSED", [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = 10 SECS, [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT #define sCE SCTP_CONNTRACK_COOKIE_ECHOED #define sES SCTP_CONNTRACK_ESTABLISHED #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sIV SCTP_CONNTRACK_MAX /* These are the descriptions of the states: NOTE: These state names are tantalizingly similar to the states of an SCTP endpoint. But the interpretation of the states is a little different, considering that these are the states of the connection and not of an end point. Please note the subtleties. -Kiran NONE - Nothing so far. COOKIE WAIT - We have seen an INIT chunk in the original direction, or also an INIT_ACK chunk in the reply direction. COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. */ /* TODO - I have assumed that the first INIT is in the original direction. This messes things when an INIT comes in the reply direction in CLOSED state. - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - Full Multi Homing support. */ /* SCTP conntrack state transitions */ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ /* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ /* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, /* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #endif #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned long *map) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; int flag; flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. */ if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { pr_debug("Basic checks failed\n"); return 1; } if (map) set_bit(sch->type, map); } pr_debug("Basic checks passed\n"); return count == 0; } static int sctp_new_state(enum ip_conntrack_dir dir, enum sctp_conntrack cur_state, int chunk_type) { int i; pr_debug("Chunk type: %d\n", chunk_type); switch (chunk_type) { case SCTP_CID_INIT: pr_debug("SCTP_CID_INIT\n"); i = 0; break; case SCTP_CID_INIT_ACK: pr_debug("SCTP_CID_INIT_ACK\n"); i = 1; break; case SCTP_CID_ABORT: pr_debug("SCTP_CID_ABORT\n"); i = 2; break; case SCTP_CID_SHUTDOWN: pr_debug("SCTP_CID_SHUTDOWN\n"); i = 3; break; case SCTP_CID_SHUTDOWN_ACK: pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); i = 4; break; case SCTP_CID_ERROR: pr_debug("SCTP_CID_ERROR\n"); i = 5; break; case SCTP_CID_COOKIE_ECHO: pr_debug("SCTP_CID_COOKIE_ECHO\n"); i = 6; break; case SCTP_CID_COOKIE_ACK: pr_debug("SCTP_CID_COOKIE_ACK\n"); i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; case SCTP_CID_HEARTBEAT: pr_debug("SCTP_CID_HEARTBEAT"); i = 9; break; case SCTP_CID_HEARTBEAT_ACK: pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", sctp_conntrack_names[cur_state]); return cur_state; } pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", dir, sctp_conntrack_names[cur_state], chunk_type, sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); return sctp_conntracks[dir][i][cur_state]; } /* Don't need lock here: this conntrack not in circulation yet */ static noinline bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct sctphdr *sh, unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u32 offset, count; memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ if (new_state == SCTP_CONNTRACK_NONE || new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); return false; } /* Copy the vtag into the state info */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _inithdr, *ih; /* Sec 8.5.1 (A) */ if (sh->vtag) return false; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(_inithdr), &_inithdr); if (!ih) return false; pr_debug("Setting vtag %x for new conn\n", ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else if (sch->type == SCTP_CID_HEARTBEAT) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; } static bool sctp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { const struct sctphdr *sh; const char *logmsg; if (skb->len < dataoff + sizeof(struct sctphdr)) { logmsg = "nf_ct_sctp: short packet "; goto out_invalid; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } sh = (const struct sctphdr *)(skb->data + dataoff); if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { logmsg = "nf_ct_sctp: bad CRC "; goto out_invalid; } skb->ip_summed = CHECKSUM_UNNECESSARY; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); const struct sctphdr *sh; struct sctphdr _sctph; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; if (do_basic_checks(ct, skb, dataoff, map) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ if (test_bit(SCTP_CID_ABORT, map) || test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || test_bit(SCTP_CID_COOKIE_ACK, map)) return -NF_ACCEPT; if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; } /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { pr_debug("Verification tag check failed\n"); goto out; } old_state = new_state = SCTP_CONNTRACK_NONE; spin_lock_bh(&ct->lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* (B) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* (C) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ACK) { ct->proto.sctp.init[dir] = 0; ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.last_dir = dir; ignore = true; continue; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || ct->proto.sctp.last_dir == dir) goto out_unlock; ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.vtag[dir] = sh->vtag; ct->proto.sctp.vtag[!dir] = 0; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } old_state = ct->proto.sctp.state; new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " "conntrack=%u\n", dir, sch->type, old_state); goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _ih, *ih; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) ct->proto.sctp.init[!dir] = 0; ct->proto.sctp.init[dir] = 1; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so * port reuse by client or NAT middlebox cannot * keep entry alive indefinitely (incl. nat info). */ if (new_state == SCTP_CONNTRACK_CLOSED && old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; } else if (sch->type == SCTP_CID_INIT_ACK) { struct sctp_inithdr _ih, *ih; __be32 vtag; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; vtag = ct->proto.sctp.vtag[!dir]; if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) goto out_unlock; /* collision */ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && vtag != ih->init_tag) goto out_unlock; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (new_state == SCTP_CONNTRACK_ESTABLISHED && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } } spin_unlock_bh(&ct->lock); /* allow but do not refresh timeout */ if (ignore) return NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; out_unlock: spin_unlock_bh(&ct->lock); out: return -NF_ACCEPT; } static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { case SCTP_CONNTRACK_SHUTDOWN_SENT: case SCTP_CONNTRACK_SHUTDOWN_RECD: case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT: return true; default: break; } return false; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; #define SCTP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 4) + \ NLA_ALIGN(NLA_HDRLEN + 4)) static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; int err; /* updates may not contain the internal protocol info, skip parsing */ if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr, sctp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_SCTP_STATE] || !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) return -EINVAL; spin_lock_bh(&ct->lock); ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; if (!timeouts) timeouts = sn->timeouts; /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED]; return 0; } static int sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_sctp_init_net(struct net *net) { struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; for (i = 0; i < SCTP_CONNTRACK_MAX; i++) sn->timeouts[i] = sctp_timeouts[i]; /* timeouts[0] is unused, init it so ->timeouts[0] contains * 'new' timeout, like udp or icmp. */ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .can_early_drop = sctp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_SCTP_MAX, .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
240 95 2761 2509 896 2386 1446 582 1665 2387 9 914 2337 3007 3007 2858 2303 81 81 81 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> (C) 2002 David Woodhouse <dwmw2@infradead.org> (C) 2012 Michel Lespinasse <walken@google.com> linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and * rb_insert_augmented() instead of the usual rb_insert_color() call. * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. */ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { __rb_insert_augmented(node, root, augment->rotate); } static inline void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { if (newleft) root->rb_leftmost = node; rb_insert_augmented(node, &root->rb_root, augment); } /* * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ #define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ if (RBCOMPUTE(node, true)) \ break; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ .copy = RBNAME ## _copy, \ .rotate = RBNAME ## _rotate \ }; /* * Template for declaring augmented rbtree callbacks, * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p | color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } static inline void __rb_change_child_rcu(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) rcu_assign_pointer(parent->rb_left, new); else rcu_assign_pointer(parent->rb_right, new); } else rcu_assign_pointer(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */ |
17553 17553 17 2344 2368 17553 14 1 17553 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPU's in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/atomic.h> #include <linux/bug.h> /* Don't assign or return these: may not be this big! */ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; /** * cpumask_bits - get the bits in a cpumask * @maskp: the struct cpumask * * * You should only assume nr_cpu_ids bits of this mask are valid. This is * a macro so it's const-correct. */ #define cpumask_bits(maskp) ((maskp)->bits) /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if NR_CPUS == 1 #define nr_cpu_ids 1U #else extern unsigned int nr_cpu_ids; #endif #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ #define nr_cpumask_bits nr_cpu_ids #else #define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU id's * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_possible_mask is forced to have * all NR_CPUS bits set, otherwise it is just the set of CPUs that * ACPI reports present at boot. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. */ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; extern struct cpumask __cpu_dying_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, nr_cpumask_bits); return cpu; } #if NR_CPUS == 1 /* Uniprocessor. Assume all masks are "1". */ static inline unsigned int cpumask_first(const struct cpumask *srcp) { return 0; } static inline unsigned int cpumask_last(const struct cpumask *srcp) { return 0; } /* Valid inputs for n are -1 and 0. */ static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { return n+1; } static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { return n+1; } static inline unsigned int cpumask_next_and(int n, const struct cpumask *srcp, const struct cpumask *andp) { return n+1; } static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { /* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */ return (wrap && n == 0); } /* cpu must be a valid cpu, ie 0, so there's no other choice. */ static inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { return 1; } static inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static inline int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_next_and(-1, src1p, src2p); } static inline int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) #define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) #else /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Returns >= nr_cpu_ids if no cpus set. */ static inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Returns >= nr_cpumask_bits if no CPUs set. */ static inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); } unsigned int __pure cpumask_next(int n, const struct cpumask *srcp); /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (ie. return will be > @n) * @srcp: the cpumask pointer * * Returns >= nr_cpu_ids if no further cpus unset. */ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); } int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); int cpumask_any_distribute(const struct cpumask *srcp); /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next((cpu), (mask)), \ (cpu) < nr_cpu_ids;) /** * for_each_cpu_not - iterate over every cpu in a complemented mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_not(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ (cpu) < nr_cpumask_bits; \ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = -1; \ (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) #endif /* SMP */ #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in @cpumask, else returns 0 */ static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0 * * test_and_set_bit wrapper for cpumasks. */ static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0 * * test_and_clear_bit wrapper for cpumasks. */ static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static inline void cpumask_setall(struct cpumask *dstp) { bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * If *@dstp is empty, returns 0, else returns 1 */ static inline int cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * If *@dstp is empty, returns 0, else returns 1 */ static inline int cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_complement - *dstp = ~*srcp * @dstp: the cpumask result * @srcp: the input to invert */ static inline void cpumask_complement(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_equal - *src1p == *src2p * @src1p: the first input * @src2p: the second input */ static inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_or_equal - *src1p | *src2p == *src3p * @src1p: the first input * @src2p: the second input * @src3p: the third input */ static inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), cpumask_bits(src3p), nr_cpumask_bits); } /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input * @src2p: the second input */ static inline bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_subset - (*src1p & ~*src2p) == 0 * @src1p: the first input * @src2p: the second input * * Returns 1 if *@src1p is a subset of *@src2p, else returns 0 */ static inline int cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. */ static inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. */ static inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. */ static inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_shift_left - *dstp = *srcp << n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_copy - *dstp = *srcp * @dstp: the result * @srcp: the input cpumask */ static inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_any - pick a "random" cpu from *srcp * @srcp: the input cpumask * * Returns >= nr_cpu_ids if no cpus set. */ #define cpumask_any(srcp) cpumask_first(srcp) /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @src1p: the first input * @src2p: the second input * * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ #define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) /** * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * * Returns >= nr_cpu_ids if no cpus set. */ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) /** * cpumask_of - the cpumask containing just a given cpu * @cpu: the cpu (<= nr_cpu_ids) */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parselist_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_size - size to allocate for a 'struct cpumask' in bytes */ static inline unsigned int cpumask_size(void) { return bitmap_size(nr_cpumask_bits); } /* * cpumask_var_t: struct cpumask for stack usage. * * Oh, the wicked games we play! In order to make kernel coding a * little more difficult, we typedef cpumask_var_t to an array or a * pointer: doing &mask on an array is a noop, so it still works. * * ie. * cpumask_var_t tmpmask; * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) * return -ENOMEM; * * ... use 'tmpmask' like a normal struct cpumask * ... * * free_cpumask_var(tmpmask); * * * However, one notable exception is there. alloc_cpumask_var() allocates * only nr_cpumask_bits bits (in the other hand, real cpumask_t always has * NR_CPUS bits). Therefore you don't have to dereference cpumask_var_t. * * cpumask_var_t tmpmask; * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) * return -ENOMEM; * * var = *tmpmask; * * This code makes NR_CPUS length memcopy and brings to a memory corruption. * cpumask_copy() provide safe copy functionality. * * Note that there is another evil here: If you define a cpumask_var_t * as a percpu variable then the way to obtain the address of the cpumask * structure differently influences what this_cpu_* operation needs to be * used. Please use this_cpu_cpumask_var_t in those cases. The direct use * of this_cpu_ptr() or this_cpu_read() will lead to failures when the * other type of cpumask_var_t implementation is configured. * * Please also note that __cpumask_var_read_mostly can be used to declare * a cpumask_var_t variable itself (not its content) as read mostly. */ #ifdef CONFIG_CPUMASK_OFFSTACK typedef struct cpumask *cpumask_var_t; #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); static inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } #else typedef struct cpumask cpumask_var_t[1]; #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } static inline void free_cpumask_var(cpumask_var_t mask) { } static inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } static inline bool cpumask_available(cpumask_var_t mask) { return true; } #endif /* CONFIG_CPUMASK_OFFSTACK */ /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define cpu_all_mask to_cpumask(cpu_all_bits) /* First bits of cpu_bit_bitmap are in fact unset. */ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); static inline void reset_cpu_possible_mask(void) { bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); } static inline void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) cpumask_set_cpu(cpu, &__cpu_possible_mask); else cpumask_clear_cpu(cpu, &__cpu_possible_mask); } static inline void set_cpu_present(unsigned int cpu, bool present) { if (present) cpumask_set_cpu(cpu, &__cpu_present_mask); else cpumask_clear_cpu(cpu, &__cpu_present_mask); } void set_cpu_online(unsigned int cpu, bool online); static inline void set_cpu_active(unsigned int cpu, bool active) { if (active) cpumask_set_cpu(cpu, &__cpu_active_mask); else cpumask_clear_cpu(cpu, &__cpu_active_mask); } static inline void set_cpu_dying(unsigned int cpu, bool dying) { if (dying) cpumask_set_cpu(cpu, &__cpu_dying_mask); else cpumask_clear_cpu(cpu, &__cpu_dying_mask); } /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * * There are a few places where cpumask_var_t isn't appropriate and * static cpumasks must be used (eg. very early boot), yet we don't * expose the definition of 'struct cpumask'. * * This does the conversion, and can be used as a constant initializer. */ #define to_cpumask(bitmap) \ ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) static inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } /* * Special-case data structure for "single bit set only" constant CPU masks. * * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return to_cpumask(p); } #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. */ static inline unsigned int num_online_cpus(void) { return atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) static inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } static inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } static inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } static inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } static inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U static inline bool cpu_online(unsigned int cpu) { return cpu == 0; } static inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } static inline bool cpu_present(unsigned int cpu) { return cpu == 0; } static inline bool cpu_active(unsigned int cpu) { return cpu == 0; } static inline bool cpu_dying(unsigned int cpu) { return false; } #endif /* NR_CPUS > 1 */ #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG #define CPU_BITS_ALL \ { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #endif /* NR_CPUS > BITS_PER_LONG */ /** * cpumap_print_to_pagebuf - copies the cpumask into the buffer either * as comma-separated list of cpus or hex values of cpumask * @list: indicates whether the cpumap must be list * @mask: the cpumask to copy * @buf: the buffer to copy into * * Returns the length of the (null-terminated) @buf string, zero if * nothing is copied. */ static inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), nr_cpu_ids); } /** * cpumap_print_bitmask_to_buf - copies the cpumask into the buffer as * hex values of cpumask * * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * The function prints the cpumask into the buffer as hex values of * cpumask; Typically used by bin_attribute to export cpumask bitmask * ABI. * * Returns the length of how many bytes have been copied, excluding * terminating '\0'. */ static inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } /** * cpumap_print_list_to_buf - copies the cpumask into the buffer as * comma-separated list of cpus * * Everything is same with the above cpumap_print_bitmask_to_buf() * except the print format. */ static inline ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } #if NR_CPUS <= BITS_PER_LONG #define CPU_MASK_ALL \ (cpumask_t) { { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #else #define CPU_MASK_ALL \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #endif /* NR_CPUS > BITS_PER_LONG */ #define CPU_MASK_NONE \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } } #define CPU_MASK_CPU0 \ (cpumask_t) { { \ [0] = 1UL \ } } /* * Provide a valid theoretical max size for cpumap and cpulist sysfs files * to avoid breaking userspace which may allocate a buffer based on the size * reported by e.g. fstat. * * for cpumap NR_CPUS * 9/32 - 1 should be an exact length. * * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up * to 2 orders of magnitude larger than 8192. And then we divide by 2 to * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * * Use PAGE_SIZE as a minimum for smaller configurations while avoiding * unsigned comparison to -1. */ #define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) #endif /* __LINUX_CPUMASK_H */ |
890 925 98 78 38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 | // SPDX-License-Identifier: GPL-2.0-only /* net/atm/clip.c - RFC1577 Classical IP over ATM */ /* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> /* for UINT_MAX */ #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/timer.h> #include <linux/if_arp.h> /* for some manifest constants */ #include <linux/notifier.h> #include <linux/atm.h> #include <linux/atmdev.h> #include <linux/atmclip.h> #include <linux/atmarp.h> #include <linux/capability.h> #include <linux/ip.h> /* for net/route.h */ #include <linux/in.h> /* for struct sockaddr_in */ #include <linux/if.h> /* for IFF_UP */ #include <linux/inetdevice.h> #include <linux/bitops.h> #include <linux/poison.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <linux/jhash.h> #include <linux/slab.h> #include <net/route.h> /* for struct rtable and routing */ #include <net/icmp.h> /* icmp_send */ #include <net/arp.h> #include <linux/param.h> /* for HZ */ #include <linux/uaccess.h> #include <asm/byteorder.h> /* for htons etc. */ #include <linux/atomic.h> #include "common.h" #include "resources.h" #include <net/atmclip.h> static struct net_device *clip_devs; static struct atm_vcc *atmarpd; static struct timer_list idle_timer; static const struct neigh_ops clip_neigh_ops; static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) { struct sock *sk; struct atmarp_ctrl *ctrl; struct sk_buff *skb; pr_debug("(%d)\n", type); if (!atmarpd) return -EUNATCH; skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC); if (!skb) return -ENOMEM; ctrl = skb_put(skb, sizeof(struct atmarp_ctrl)); ctrl->type = type; ctrl->itf_num = itf; ctrl->ip = ip; atm_force_charge(atmarpd, skb->truesize); sk = sk_atm(atmarpd); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return 0; } static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry) { pr_debug("%p to entry %p (neigh %p)\n", clip_vcc, entry, entry->neigh); clip_vcc->entry = entry; clip_vcc->xoff = 0; /* @@@ may overrun buffer by one packet */ clip_vcc->next = entry->vccs; entry->vccs = clip_vcc; entry->neigh->used = jiffies; } static void unlink_clip_vcc(struct clip_vcc *clip_vcc) { struct atmarp_entry *entry = clip_vcc->entry; struct clip_vcc **walk; if (!entry) { pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); return; } netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ entry->neigh->used = jiffies; for (walk = &entry->vccs; *walk; walk = &(*walk)->next) if (*walk == clip_vcc) { int error; *walk = clip_vcc->next; /* atomic */ clip_vcc->entry = NULL; if (clip_vcc->xoff) netif_wake_queue(entry->neigh->dev); if (entry->vccs) goto out; entry->expires = jiffies - 1; /* force resolution or expiration */ error = neigh_update(entry->neigh, NULL, NUD_NONE, NEIGH_UPDATE_F_ADMIN, 0); if (error) pr_err("neigh_update failed with %d\n", error); goto out; } pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); out: netif_tx_unlock_bh(entry->neigh->dev); } /* The neighbour entry n->lock is held. */ static int neigh_check_cb(struct neighbour *n) { struct atmarp_entry *entry = neighbour_priv(n); struct clip_vcc *cv; if (n->ops != &clip_neigh_ops) return 0; for (cv = entry->vccs; cv; cv = cv->next) { unsigned long exp = cv->last_use + cv->idle_timeout; if (cv->idle_timeout && time_after(jiffies, exp)) { pr_debug("releasing vcc %p->%p of entry %p\n", cv, cv->vcc, entry); vcc_release_async(cv->vcc, -ETIMEDOUT); } } if (entry->vccs || time_before(jiffies, entry->expires)) return 0; if (refcount_read(&n->refcnt) > 1) { struct sk_buff *skb; pr_debug("destruction postponed with ref %d\n", refcount_read(&n->refcnt)); while ((skb = skb_dequeue(&n->arp_queue)) != NULL) dev_kfree_skb(skb); return 0; } pr_debug("expired neigh %p\n", n); return 1; } static void idle_timer_check(struct timer_list *unused) { write_lock(&arp_tbl.lock); __neigh_for_each_release(&arp_tbl, neigh_check_cb); mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); write_unlock(&arp_tbl.lock); } static int clip_arp_rcv(struct sk_buff *skb) { struct atm_vcc *vcc; pr_debug("\n"); vcc = ATM_SKB(skb)->vcc; if (!vcc || !atm_charge(vcc, skb->truesize)) { dev_kfree_skb_any(skb); return 0; } pr_debug("pushing to %p\n", vcc); pr_debug("using %p\n", CLIP_VCC(vcc)->old_push); CLIP_VCC(vcc)->old_push(vcc, skb); return 0; } static const unsigned char llc_oui[] = { 0xaa, /* DSAP: non-ISO */ 0xaa, /* SSAP: non-ISO */ 0x03, /* Ctrl: Unnumbered Information Command PDU */ 0x00, /* OUI: EtherType */ 0x00, 0x00 }; static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) { struct clip_vcc *clip_vcc = CLIP_VCC(vcc); pr_debug("\n"); if (!clip_devs) { atm_return(vcc, skb->truesize); kfree_skb(skb); return; } if (!skb) { pr_debug("removing VCC %p\n", clip_vcc); if (clip_vcc->entry) unlink_clip_vcc(clip_vcc); clip_vcc->old_push(vcc, NULL); /* pass on the bad news */ kfree(clip_vcc); return; } atm_return(vcc, skb->truesize); skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs; /* clip_vcc->entry == NULL if we don't have an IP address yet */ if (!skb->dev) { dev_kfree_skb_any(skb); return; } ATM_SKB(skb)->vcc = vcc; skb_reset_mac_header(skb); if (!clip_vcc->encap || skb->len < RFC1483LLC_LEN || memcmp(skb->data, llc_oui, sizeof(llc_oui))) skb->protocol = htons(ETH_P_IP); else { skb->protocol = ((__be16 *)skb->data)[3]; skb_pull(skb, RFC1483LLC_LEN); if (skb->protocol == htons(ETH_P_ARP)) { skb->dev->stats.rx_packets++; skb->dev->stats.rx_bytes += skb->len; clip_arp_rcv(skb); return; } } clip_vcc->last_use = jiffies; skb->dev->stats.rx_packets++; skb->dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); } /* * Note: these spinlocks _must_not_ block on non-SMP. The only goal is that * clip_pop is atomic with respect to the critical section in clip_start_xmit. */ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb) { struct clip_vcc *clip_vcc = CLIP_VCC(vcc); struct net_device *dev = skb->dev; int old; unsigned long flags; pr_debug("(vcc %p)\n", vcc); clip_vcc->old_pop(vcc, skb); /* skb->dev == NULL in outbound ARP packets */ if (!dev) return; spin_lock_irqsave(&PRIV(dev)->xoff_lock, flags); if (atm_may_send(vcc, 0)) { old = xchg(&clip_vcc->xoff, 0); if (old) netif_wake_queue(dev); } spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags); } static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 *ip = (__be32 *) neigh->primary_key; pr_debug("(neigh %p, skb %p)\n", neigh, skb); to_atmarpd(act_need, PRIV(neigh->dev)->number, *ip); } static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb) { #ifndef CONFIG_ATM_CLIP_NO_ICMP icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); #endif kfree_skb(skb); } static const struct neigh_ops clip_neigh_ops = { .family = AF_INET, .solicit = clip_neigh_solicit, .error_report = clip_neigh_error, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; static int clip_constructor(struct net_device *dev, struct neighbour *neigh) { struct atmarp_entry *entry = neighbour_priv(neigh); if (neigh->tbl->family != AF_INET) return -EINVAL; if (neigh->type != RTN_UNICAST) return -EINVAL; neigh->nud_state = NUD_NONE; neigh->ops = &clip_neigh_ops; neigh->output = neigh->ops->output; entry->neigh = neigh; entry->vccs = NULL; entry->expires = jiffies - 1; return 0; } /* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */ /* * We play with the resolve flag: 0 and 1 have the usual meaning, but -1 means * to allocate the neighbour entry but not to ask atmarpd for resolution. Also, * don't increment the usage count. This is used to create entries in * clip_setentry. */ static int clip_encap(struct atm_vcc *vcc, int mode) { if (!CLIP_VCC(vcc)) return -EBADFD; CLIP_VCC(vcc)->encap = mode; return 0; } static netdev_tx_t clip_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct clip_priv *clip_priv = PRIV(dev); struct dst_entry *dst = skb_dst(skb); struct atmarp_entry *entry; struct neighbour *n; struct atm_vcc *vcc; struct rtable *rt; __be32 *daddr; int old; unsigned long flags; pr_debug("(skb %p)\n", skb); if (!dst) { pr_err("skb_dst(skb) == NULL\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } rt = (struct rtable *) dst; if (rt->rt_gw_family == AF_INET) daddr = &rt->rt_gw4; else daddr = &ip_hdr(skb)->daddr; n = dst_neigh_lookup(dst, daddr); if (!n) { pr_err("NO NEIGHBOUR !\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } entry = neighbour_priv(n); if (!entry->vccs) { if (time_after(jiffies, entry->expires)) { /* should be resolved */ entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ; to_atmarpd(act_need, PRIV(dev)->number, *((__be32 *)n->primary_key)); } if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS) skb_queue_tail(&entry->neigh->arp_queue, skb); else { dev_kfree_skb(skb); dev->stats.tx_dropped++; } goto out_release_neigh; } pr_debug("neigh %p, vccs %p\n", entry, entry->vccs); ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc; pr_debug("using neighbour %p, vcc %p\n", n, vcc); if (entry->vccs->encap) { void *here; here = skb_push(skb, RFC1483LLC_LEN); memcpy(here, llc_oui, sizeof(llc_oui)); ((__be16 *) here)[3] = skb->protocol; } atm_account_tx(vcc, skb); entry->vccs->last_use = jiffies; pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev); old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */ if (old) { pr_warn("XOFF->XOFF transition\n"); goto out_release_neigh; } dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; vcc->send(vcc, skb); if (atm_may_send(vcc, 0)) { entry->vccs->xoff = 0; goto out_release_neigh; } spin_lock_irqsave(&clip_priv->xoff_lock, flags); netif_stop_queue(dev); /* XOFF -> throttle immediately */ barrier(); if (!entry->vccs->xoff) netif_start_queue(dev); /* Oh, we just raced with clip_pop. netif_start_queue should be good enough, because nothing should really be asleep because of the brief netif_stop_queue. If this isn't true or if it changes, use netif_wake_queue instead. */ spin_unlock_irqrestore(&clip_priv->xoff_lock, flags); out_release_neigh: neigh_release(n); return NETDEV_TX_OK; } static int clip_mkip(struct atm_vcc *vcc, int timeout) { struct clip_vcc *clip_vcc; if (!vcc->push) return -EBADFD; clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL); if (!clip_vcc) return -ENOMEM; pr_debug("%p vcc %p\n", clip_vcc, vcc); clip_vcc->vcc = vcc; vcc->user_back = clip_vcc; set_bit(ATM_VF_IS_CLIP, &vcc->flags); clip_vcc->entry = NULL; clip_vcc->xoff = 0; clip_vcc->encap = 1; clip_vcc->last_use = jiffies; clip_vcc->idle_timeout = timeout * HZ; clip_vcc->old_push = vcc->push; clip_vcc->old_pop = vcc->pop; vcc->push = clip_push; vcc->pop = clip_pop; /* re-process everything received between connection setup and MKIP */ vcc_process_recv_queue(vcc); return 0; } static int clip_setentry(struct atm_vcc *vcc, __be32 ip) { struct neighbour *neigh; struct atmarp_entry *entry; int error; struct clip_vcc *clip_vcc; struct rtable *rt; if (vcc->push != clip_push) { pr_warn("non-CLIP VCC\n"); return -EBADF; } clip_vcc = CLIP_VCC(vcc); if (!ip) { if (!clip_vcc->entry) { pr_err("hiding hidden ATMARP entry\n"); return 0; } pr_debug("remove\n"); unlink_clip_vcc(clip_vcc); return 0; } rt = ip_route_output(&init_net, ip, 0, 1, 0); if (IS_ERR(rt)) return PTR_ERR(rt); neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1); ip_rt_put(rt); if (!neigh) return -ENOMEM; entry = neighbour_priv(neigh); if (entry != clip_vcc->entry) { if (!clip_vcc->entry) pr_debug("add\n"); else { pr_debug("update\n"); unlink_clip_vcc(clip_vcc); } link_vcc(clip_vcc, entry); } error = neigh_update(neigh, llc_oui, NUD_PERMANENT, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); return error; } static const struct net_device_ops clip_netdev_ops = { .ndo_start_xmit = clip_start_xmit, .ndo_neigh_construct = clip_constructor, }; static void clip_setup(struct net_device *dev) { dev->netdev_ops = &clip_netdev_ops; dev->type = ARPHRD_ATM; dev->neigh_priv_len = sizeof(struct atmarp_entry); dev->hard_header_len = RFC1483LLC_LEN; dev->mtu = RFC1626_MTU; dev->tx_queue_len = 100; /* "normal" queue (packets) */ /* When using a "real" qdisc, the qdisc determines the queue */ /* length. tx_queue_len is only used for the default case, */ /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. */ netif_keep_dst(dev); } static int clip_create(int number) { struct net_device *dev; struct clip_priv *clip_priv; int error; if (number != -1) { for (dev = clip_devs; dev; dev = PRIV(dev)->next) if (PRIV(dev)->number == number) return -EEXIST; } else { number = 0; for (dev = clip_devs; dev; dev = PRIV(dev)->next) if (PRIV(dev)->number >= number) number = PRIV(dev)->number + 1; } dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN, clip_setup); if (!dev) return -ENOMEM; clip_priv = PRIV(dev); sprintf(dev->name, "atm%d", number); spin_lock_init(&clip_priv->xoff_lock); clip_priv->number = number; error = register_netdev(dev); if (error) { free_netdev(dev); return error; } clip_priv->next = clip_devs; clip_devs = dev; pr_debug("registered (net:%s)\n", dev->name); return number; } static int clip_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) return NOTIFY_DONE; /* ignore non-CLIP devices */ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops) return NOTIFY_DONE; switch (event) { case NETDEV_UP: pr_debug("NETDEV_UP\n"); to_atmarpd(act_up, PRIV(dev)->number, 0); break; case NETDEV_GOING_DOWN: pr_debug("NETDEV_DOWN\n"); to_atmarpd(act_down, PRIV(dev)->number, 0); break; case NETDEV_CHANGE: case NETDEV_CHANGEMTU: pr_debug("NETDEV_CHANGE*\n"); to_atmarpd(act_change, PRIV(dev)->number, 0); break; } return NOTIFY_DONE; } static int clip_inet_event(struct notifier_block *this, unsigned long event, void *ifa) { struct in_device *in_dev; struct netdev_notifier_info info; in_dev = ((struct in_ifaddr *)ifa)->ifa_dev; /* * Transitions are of the down-change-up type, so it's sufficient to * handle the change on up. */ if (event != NETDEV_UP) return NOTIFY_DONE; netdev_notifier_info_init(&info, in_dev->dev); return clip_device_event(this, NETDEV_CHANGE, &info); } static struct notifier_block clip_dev_notifier = { .notifier_call = clip_device_event, }; static struct notifier_block clip_inet_notifier = { .notifier_call = clip_inet_event, }; static void atmarpd_close(struct atm_vcc *vcc) { pr_debug("\n"); rtnl_lock(); atmarpd = NULL; skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); rtnl_unlock(); pr_debug("(done)\n"); module_put(THIS_MODULE); } static const struct atmdev_ops atmarpd_dev_ops = { .close = atmarpd_close }; static struct atm_dev atmarpd_dev = { .ops = &atmarpd_dev_ops, .type = "arpd", .number = 999, .lock = __SPIN_LOCK_UNLOCKED(atmarpd_dev.lock) }; static int atm_init_atmarp(struct atm_vcc *vcc) { rtnl_lock(); if (atmarpd) { rtnl_unlock(); return -EADDRINUSE; } mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); atmarpd = vcc; set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); /* allow replies and avoid getting closed if signaling dies */ vcc->dev = &atmarpd_dev; vcc_insert_socket(sk_atm(vcc)); vcc->push = NULL; vcc->pop = NULL; /* crash */ vcc->push_oam = NULL; /* crash */ rtnl_unlock(); return 0; } static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct atm_vcc *vcc = ATM_SD(sock); int err = 0; switch (cmd) { case SIOCMKCLIP: case ATMARPD_CTRL: case ATMARP_MKIP: case ATMARP_SETENTRY: case ATMARP_ENCAP: if (!capable(CAP_NET_ADMIN)) return -EPERM; break; default: return -ENOIOCTLCMD; } switch (cmd) { case SIOCMKCLIP: err = clip_create(arg); break; case ATMARPD_CTRL: err = atm_init_atmarp(vcc); if (!err) { sock->state = SS_CONNECTED; __module_get(THIS_MODULE); } break; case ATMARP_MKIP: err = clip_mkip(vcc, arg); break; case ATMARP_SETENTRY: err = clip_setentry(vcc, (__force __be32)arg); break; case ATMARP_ENCAP: err = clip_encap(vcc, arg); break; } return err; } static struct atm_ioctl clip_ioctl_ops = { .owner = THIS_MODULE, .ioctl = clip_ioctl, }; #ifdef CONFIG_PROC_FS static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr) { static int code[] = { 1, 2, 10, 6, 1, 0 }; static int e164[] = { 1, 8, 4, 6, 1, 0 }; if (*addr->sas_addr.pub) { seq_printf(seq, "%s", addr->sas_addr.pub); if (*addr->sas_addr.prv) seq_putc(seq, '+'); } else if (!*addr->sas_addr.prv) { seq_printf(seq, "%s", "(none)"); return; } if (*addr->sas_addr.prv) { unsigned char *prv = addr->sas_addr.prv; int *fields; int i, j; fields = *prv == ATM_AFI_E164 ? e164 : code; for (i = 0; fields[i]; i++) { for (j = fields[i]; j; j--) seq_printf(seq, "%02X", *prv++); if (fields[i + 1]) seq_putc(seq, '.'); } } } /* This means the neighbour entry has no attached VCC objects. */ #define SEQ_NO_VCC_TOKEN ((void *) 2) static void atmarp_info(struct seq_file *seq, struct neighbour *n, struct atmarp_entry *entry, struct clip_vcc *clip_vcc) { struct net_device *dev = n->dev; unsigned long exp; char buf[17]; int svc, llc, off; svc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || (sk_atm(clip_vcc->vcc)->sk_family == AF_ATMSVC)); llc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || clip_vcc->encap); if (clip_vcc == SEQ_NO_VCC_TOKEN) exp = entry->neigh->used; else exp = clip_vcc->last_use; exp = (jiffies - exp) / HZ; seq_printf(seq, "%-6s%-4s%-4s%5ld ", dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp); off = scnprintf(buf, sizeof(buf) - 1, "%pI4", n->primary_key); while (off < 16) buf[off++] = ' '; buf[off] = '\0'; seq_printf(seq, "%s", buf); if (clip_vcc == SEQ_NO_VCC_TOKEN) { if (time_before(jiffies, entry->expires)) seq_printf(seq, "(resolving)\n"); else seq_printf(seq, "(expired, ref %d)\n", refcount_read(&entry->neigh->refcnt)); } else if (!svc) { seq_printf(seq, "%d.%d.%d\n", clip_vcc->vcc->dev->number, clip_vcc->vcc->vpi, clip_vcc->vcc->vci); } else { svc_addr(seq, &clip_vcc->vcc->remote); seq_putc(seq, '\n'); } } struct clip_seq_state { /* This member must be first. */ struct neigh_seq_state ns; /* Local to clip specific iteration. */ struct clip_vcc *vcc; }; static struct clip_vcc *clip_seq_next_vcc(struct atmarp_entry *e, struct clip_vcc *curr) { if (!curr) { curr = e->vccs; if (!curr) return SEQ_NO_VCC_TOKEN; return curr; } if (curr == SEQ_NO_VCC_TOKEN) return NULL; curr = curr->next; return curr; } static void *clip_seq_vcc_walk(struct clip_seq_state *state, struct atmarp_entry *e, loff_t * pos) { struct clip_vcc *vcc = state->vcc; vcc = clip_seq_next_vcc(e, vcc); if (vcc && pos != NULL) { while (*pos) { vcc = clip_seq_next_vcc(e, vcc); if (!vcc) break; --(*pos); } } state->vcc = vcc; return vcc; } static void *clip_seq_sub_iter(struct neigh_seq_state *_state, struct neighbour *n, loff_t * pos) { struct clip_seq_state *state = (struct clip_seq_state *)_state; if (n->dev->type != ARPHRD_ATM) return NULL; return clip_seq_vcc_walk(state, neighbour_priv(n), pos); } static void *clip_seq_start(struct seq_file *seq, loff_t * pos) { struct clip_seq_state *state = seq->private; state->ns.neigh_sub_iter = clip_seq_sub_iter; return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_NEIGH_ONLY); } static int clip_seq_show(struct seq_file *seq, void *v) { static char atm_arp_banner[] = "IPitf TypeEncp Idle IP address ATM address\n"; if (v == SEQ_START_TOKEN) { seq_puts(seq, atm_arp_banner); } else { struct clip_seq_state *state = seq->private; struct clip_vcc *vcc = state->vcc; struct neighbour *n = v; atmarp_info(seq, n, neighbour_priv(n), vcc); } return 0; } static const struct seq_operations arp_seq_ops = { .start = clip_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = clip_seq_show, }; #endif static void atm_clip_exit_noproc(void); static int __init atm_clip_init(void) { register_atm_ioctl(&clip_ioctl_ops); register_netdevice_notifier(&clip_dev_notifier); register_inetaddr_notifier(&clip_inet_notifier); timer_setup(&idle_timer, idle_timer_check, 0); #ifdef CONFIG_PROC_FS { struct proc_dir_entry *p; p = proc_create_net("arp", 0444, atm_proc_root, &arp_seq_ops, sizeof(struct clip_seq_state)); if (!p) { pr_err("Unable to initialize /proc/net/atm/arp\n"); atm_clip_exit_noproc(); return -ENOMEM; } } #endif return 0; } static void atm_clip_exit_noproc(void) { struct net_device *dev, *next; unregister_inetaddr_notifier(&clip_inet_notifier); unregister_netdevice_notifier(&clip_dev_notifier); deregister_atm_ioctl(&clip_ioctl_ops); /* First, stop the idle timer, so it stops banging * on the table. */ del_timer_sync(&idle_timer); dev = clip_devs; while (dev) { next = PRIV(dev)->next; unregister_netdev(dev); free_netdev(dev); dev = next; } } static void __exit atm_clip_exit(void) { remove_proc_entry("arp", atm_proc_root); atm_clip_exit_noproc(); } module_init(atm_clip_init); module_exit(atm_clip_exit); MODULE_AUTHOR("Werner Almesberger"); MODULE_DESCRIPTION("Classical/IP over ATM interface"); MODULE_LICENSE("GPL"); |
50 50 50 50 50 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_app.c: Application module support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference * is that ip_vs_app module handles the reverse direction (incoming requests * and outgoing responses). * * IP_MASQ_APP application masquerading module * * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/tcp.h> #include <linux/stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <net/ip_vs.h> EXPORT_SYMBOL(register_ip_vs_app); EXPORT_SYMBOL(unregister_ip_vs_app); EXPORT_SYMBOL(register_ip_vs_app_inc); static DEFINE_MUTEX(__ip_vs_app_mutex); /* * Get an ip_vs_app object */ static inline int ip_vs_app_get(struct ip_vs_app *app) { return try_module_get(app->module); } static inline void ip_vs_app_put(struct ip_vs_app *app) { module_put(app->module); } static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) { kfree(inc->timeout_table); kfree(inc); } static void ip_vs_app_inc_rcu_free(struct rcu_head *head) { struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); ip_vs_app_inc_destroy(inc); } /* * Allocate/initialize app incarnation and register it in proto apps. */ static int ip_vs_app_inc_new(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port) { struct ip_vs_protocol *pp; struct ip_vs_app *inc; int ret; if (!(pp = ip_vs_proto_get(proto))) return -EPROTONOSUPPORT; if (!pp->unregister_app) return -EOPNOTSUPP; inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); if (!inc) return -ENOMEM; INIT_LIST_HEAD(&inc->p_list); INIT_LIST_HEAD(&inc->incs_list); inc->app = app; inc->port = htons(port); atomic_set(&inc->usecnt, 0); if (app->timeouts) { inc->timeout_table = ip_vs_create_timeout_table(app->timeouts, app->timeouts_size); if (!inc->timeout_table) { ret = -ENOMEM; goto out; } } ret = pp->register_app(ipvs, inc); if (ret) goto out; list_add(&inc->a_list, &app->incs_list); IP_VS_DBG(9, "%s App %s:%u registered\n", pp->name, inc->name, ntohs(inc->port)); return 0; out: ip_vs_app_inc_destroy(inc); return ret; } /* * Release app incarnation */ static void ip_vs_app_inc_release(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_protocol *pp; if (!(pp = ip_vs_proto_get(inc->protocol))) return; if (pp->unregister_app) pp->unregister_app(ipvs, inc); IP_VS_DBG(9, "%s App %s:%u unregistered\n", pp->name, inc->name, ntohs(inc->port)); list_del(&inc->a_list); call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); } /* * Get reference to app inc (only called from softirq) * */ int ip_vs_app_inc_get(struct ip_vs_app *inc) { int result; result = ip_vs_app_get(inc->app); if (result) atomic_inc(&inc->usecnt); return result; } /* * Put the app inc (only called from timer or net softirq) */ void ip_vs_app_inc_put(struct ip_vs_app *inc) { atomic_dec(&inc->usecnt); ip_vs_app_put(inc->app); } /* * Register an application incarnation in protocol applications */ int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port) { int result; mutex_lock(&__ip_vs_app_mutex); result = ip_vs_app_inc_new(ipvs, app, proto, port); mutex_unlock(&__ip_vs_app_mutex); return result; } /* Register application for netns */ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) { struct ip_vs_app *a; int err = 0; mutex_lock(&__ip_vs_app_mutex); /* increase the module use count */ if (!ip_vs_use_count_inc()) { err = -ENOENT; goto out_unlock; } list_for_each_entry(a, &ipvs->app_list, a_list) { if (!strcmp(app->name, a->name)) { err = -EEXIST; /* decrease the module use count */ ip_vs_use_count_dec(); goto out_unlock; } } a = kmemdup(app, sizeof(*app), GFP_KERNEL); if (!a) { err = -ENOMEM; /* decrease the module use count */ ip_vs_use_count_dec(); goto out_unlock; } INIT_LIST_HEAD(&a->incs_list); list_add(&a->a_list, &ipvs->app_list); out_unlock: mutex_unlock(&__ip_vs_app_mutex); return err ? ERR_PTR(err) : a; } /* * ip_vs_app unregistration routine * We are sure there are no app incarnations attached to services * Caller should use synchronize_rcu() or rcu_barrier() */ void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) { struct ip_vs_app *a, *anxt, *inc, *nxt; mutex_lock(&__ip_vs_app_mutex); list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { if (app && strcmp(app->name, a->name)) continue; list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { ip_vs_app_inc_release(ipvs, inc); } list_del(&a->a_list); kfree(a); /* decrease the module use count */ ip_vs_use_count_dec(); } mutex_unlock(&__ip_vs_app_mutex); } /* * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) */ int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { return pp->app_conn_bind(cp); } /* * Unbind cp from application incarnation (called by cp destructor) */ void ip_vs_unbind_app(struct ip_vs_conn *cp) { struct ip_vs_app *inc = cp->app; if (!inc) return; if (inc->unbind_conn) inc->unbind_conn(inc, cp); if (inc->done_conn) inc->done_conn(inc, cp); ip_vs_app_inc_put(inc); cp->app = NULL; } /* * Fixes th->seq based on ip_vs_seq info. */ static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) { __u32 seq = ntohl(th->seq); /* * Adjust seq with delta-offset for all packets after * the most recent resized pkt seq and with previous_delta offset * for all packets before most recent resized pkt seq. */ if (vseq->delta || vseq->previous_delta) { if(after(seq, vseq->init_seq)) { th->seq = htonl(seq + vseq->delta); IP_VS_DBG(9, "%s(): added delta (%d) to seq\n", __func__, vseq->delta); } else { th->seq = htonl(seq + vseq->previous_delta); IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n", __func__, vseq->previous_delta); } } } /* * Fixes th->ack_seq based on ip_vs_seq info. */ static inline void vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) { __u32 ack_seq = ntohl(th->ack_seq); /* * Adjust ack_seq with delta-offset for * the packets AFTER most recent resized pkt has caused a shift * for packets before most recent resized pkt, use previous_delta */ if (vseq->delta || vseq->previous_delta) { /* since ack_seq is the number of octet that is expected to receive next, so compare it with init_seq+delta */ if(after(ack_seq, vseq->init_seq+vseq->delta)) { th->ack_seq = htonl(ack_seq - vseq->delta); IP_VS_DBG(9, "%s(): subtracted delta " "(%d) from ack_seq\n", __func__, vseq->delta); } else { th->ack_seq = htonl(ack_seq - vseq->previous_delta); IP_VS_DBG(9, "%s(): subtracted " "previous_delta (%d) from ack_seq\n", __func__, vseq->previous_delta); } } } /* * Updates ip_vs_seq if pkt has been resized * Assumes already checked proto==IPPROTO_TCP and diff!=0. */ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, unsigned int flag, __u32 seq, int diff) { /* spinlock is to keep updating cp->flags atomic */ spin_lock_bh(&cp->lock); if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { vseq->previous_delta = vseq->delta; vseq->delta += diff; vseq->init_seq = seq; cp->flags |= flag; } spin_unlock_bh(&cp->lock); } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, struct ip_vs_app *app, struct ip_vs_iphdr *ipvsh) { int diff; const unsigned int tcp_offset = ip_hdrlen(skb); struct tcphdr *th; __u32 seq; if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); /* * Remember seq number in case this pkt gets resized */ seq = ntohl(th->seq); /* * Fix seq stuff if flagged as so. */ if (cp->flags & IP_VS_CONN_F_OUT_SEQ) vs_fix_seq(&cp->out_seq, th); if (cp->flags & IP_VS_CONN_F_IN_SEQ) vs_fix_ack_seq(&cp->in_seq, th); /* * Call private output hook function */ if (app->pkt_out == NULL) return 1; if (!app->pkt_out(app, cp, skb, &diff, ipvsh)) return 0; /* * Update ip_vs seq stuff if len has changed. */ if (diff != 0) vs_seq_update(cp, &cp->out_seq, IP_VS_CONN_F_OUT_SEQ, seq, diff); return 1; } /* * Output pkt hook. Will call bound ip_vs_app specific function * called by ipvs packet handler, assumes previously checked cp!=NULL * returns false if it can't handle packet (oom) */ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) { struct ip_vs_app *app; /* * check if application module is bound to * this ip_vs_conn. */ if ((app = cp->app) == NULL) return 1; /* TCP is complicated */ if (cp->protocol == IPPROTO_TCP) return app_tcp_pkt_out(cp, skb, app, ipvsh); /* * Call private output hook function */ if (app->pkt_out == NULL) return 1; return app->pkt_out(app, cp, skb, NULL, ipvsh); } static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, struct ip_vs_app *app, struct ip_vs_iphdr *ipvsh) { int diff; const unsigned int tcp_offset = ip_hdrlen(skb); struct tcphdr *th; __u32 seq; if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); /* * Remember seq number in case this pkt gets resized */ seq = ntohl(th->seq); /* * Fix seq stuff if flagged as so. */ if (cp->flags & IP_VS_CONN_F_IN_SEQ) vs_fix_seq(&cp->in_seq, th); if (cp->flags & IP_VS_CONN_F_OUT_SEQ) vs_fix_ack_seq(&cp->out_seq, th); /* * Call private input hook function */ if (app->pkt_in == NULL) return 1; if (!app->pkt_in(app, cp, skb, &diff, ipvsh)) return 0; /* * Update ip_vs seq stuff if len has changed. */ if (diff != 0) vs_seq_update(cp, &cp->in_seq, IP_VS_CONN_F_IN_SEQ, seq, diff); return 1; } /* * Input pkt hook. Will call bound ip_vs_app specific function * called by ipvs packet handler, assumes previously checked cp!=NULL. * returns false if can't handle packet (oom). */ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) { struct ip_vs_app *app; /* * check if application module is bound to * this ip_vs_conn. */ if ((app = cp->app) == NULL) return 1; /* TCP is complicated */ if (cp->protocol == IPPROTO_TCP) return app_tcp_pkt_in(cp, skb, app, ipvsh); /* * Call private input hook function */ if (app->pkt_in == NULL) return 1; return app->pkt_in(app, cp, skb, NULL, ipvsh); } #ifdef CONFIG_PROC_FS /* * /proc/net/ip_vs_app entry function */ static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos) { struct ip_vs_app *app, *inc; list_for_each_entry(app, &ipvs->app_list, a_list) { list_for_each_entry(inc, &app->incs_list, a_list) { if (pos-- == 0) return inc; } } return NULL; } static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_app_mutex); return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_app *inc, *app; struct list_head *e; struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); ++*pos; if (v == SEQ_START_TOKEN) return ip_vs_app_idx(ipvs, 0); inc = v; app = inc->app; if ((e = inc->a_list.next) != &app->incs_list) return list_entry(e, struct ip_vs_app, a_list); /* go on to next application */ for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) { app = list_entry(e, struct ip_vs_app, a_list); list_for_each_entry(inc, &app->incs_list, a_list) { return inc; } } return NULL; } static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) { mutex_unlock(&__ip_vs_app_mutex); } static int ip_vs_app_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "prot port usecnt name\n"); else { const struct ip_vs_app *inc = v; seq_printf(seq, "%-3s %-7u %-6d %-17s\n", ip_vs_proto_name(inc->protocol), ntohs(inc->port), atomic_read(&inc->usecnt), inc->name); } return 0; } static const struct seq_operations ip_vs_app_seq_ops = { .start = ip_vs_app_seq_start, .next = ip_vs_app_seq_next, .stop = ip_vs_app_seq_stop, .show = ip_vs_app_seq_show, }; #endif int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->app_list); #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; #endif return 0; } void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { unregister_ip_vs_app(ipvs, NULL /* all */); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_app", ipvs->net->proc_net); #endif } |
4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 | // SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2012 Taobao. * Written by Tao Ma <boyu.mt@taobao.com> */ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/iversion.h> #include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) return EXT4_I(inode)->i_inline_size; return 0; } static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; void *end; int free, min_offs; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; min_offs = EXT4_SB(inode->i_sb)->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - EXT4_I(inode)->i_extra_isize - sizeof(struct ext4_xattr_ibody_header); /* * We need to subtract another sizeof(__u32) since an in-inode xattr * needs an empty 4 bytes to indicate the gap between the xattr entry * and the name/value pair. */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return EXT4_XATTR_SIZE(min_offs - EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - EXT4_XATTR_ROUND - sizeof(__u32)); raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; /* Compute min_offs. */ while (!IS_LAST_ENTRY(entry)) { void *next = EXT4_XATTR_NEXT(entry); if (next >= end) { EXT4_ERROR_INODE(inode, "corrupt xattr in inline inode"); return 0; } if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } entry = next; } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); if (EXT4_I(inode)->i_inline_off) { entry = (struct ext4_xattr_entry *) ((void *)raw_inode + EXT4_I(inode)->i_inline_off); free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); goto out; } free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); if (free > EXT4_XATTR_ROUND) free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); else free = 0; out: return free; } /* * Get the maximum size we now can store in an inode. * If we can't find the space for a xattr entry, don't use the space * of the extents since we have no space to indicate the inline data. */ int ext4_get_max_inline_size(struct inode *inode) { int error, max_inline_size; struct ext4_iloc iloc; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) { ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, "can't get inode location %lu", inode->i_ino); return 0; } down_read(&EXT4_I(inode)->xattr_sem); max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); up_read(&EXT4_I(inode)->xattr_sem); brelse(iloc.bh); if (!max_inline_size) return 0; return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before * the new inode has been unlocked */ int ext4_find_inline_data_nolock(struct inode *inode) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int error; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; if (!is.s.not_found) { if (is.s.here->e_value_inum) { EXT4_ERROR_INODE(inode, "inline data xattr refers " "to an external xattr inode"); error = -EFSCORRUPTED; goto out; } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); } out: brelse(is.iloc.bh); return error; } static int ext4_read_inline_data(struct inode *inode, void *buffer, unsigned int len, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; int cp_len = 0; struct ext4_inode *raw_inode; if (!len) return 0; BUG_ON(len > EXT4_I(inode)->i_inline_size); cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? len : EXT4_MIN_INLINE_DATA_SIZE; raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); len -= cp_len; buffer += cp_len; if (!len) goto out; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); len = min_t(unsigned int, len, (unsigned int)le32_to_cpu(entry->e_value_size)); memcpy(buffer, (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); cp_len += len; out: return cp_len; } /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr * value since it is already handled by ext4_xattr_ibody_set. * That saves us one memcpy. */ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, void *buffer, loff_t pos, unsigned int len) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int cp_len = 0; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); raw_inode = ext4_raw_inode(iloc); buffer += pos; if (pos < EXT4_MIN_INLINE_DATA_SIZE) { cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE - pos : len; memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); len -= cp_len; buffer += cp_len; pos += cp_len; } if (!len) return; pos -= EXT4_MIN_INLINE_DATA_SIZE; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, buffer, len); } static int ext4_create_inline_data(handle_t *handle, struct inode *inode, unsigned len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; if (len > EXT4_MIN_INLINE_DATA_SIZE) { value = EXT4_ZERO_XATTR_VALUE; len -= EXT4_MIN_INLINE_DATA_SIZE; } else { value = ""; len = 0; } /* Insert the xttr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(!is.s.not_found); error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: brelse(is.iloc.bh); return error; } static int ext4_update_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; /* If the old space is ok, write the data directly. */ if (len <= EXT4_I(inode)->i_inline_size) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(is.s.not_found); len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); if (!value) { error = -ENOMEM; goto out; } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); if (error < 0) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; /* Update the xattr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: kfree(value); brelse(is.iloc.bh); return error; } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return -ENOSPC; size = ext4_get_max_inline_size(inode); if (size < len) return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); ext4_write_unlock_xattr(inode, &no_expand); return ret; } static int ext4_destroy_inline_data_nolock(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_xattr_ibody_find is = { .s = { .not_found = 0, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, .value = NULL, .value_len = 0, }; int error; if (!ei->i_inline_off) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, EXT4_JTR_NONE); if (error) goto out; error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); EXT4_I(inode)->i_inline_off = 0; EXT4_I(inode)->i_inline_size = 0; ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; return error; } static int ext4_read_inline_page(struct inode *inode, struct page *page) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; BUG_ON(!PageLocked(page)); BUG_ON(!ext4_has_inline_data(inode)); BUG_ON(page->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", inode->i_ino); goto out; } ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); kaddr = kmap_atomic(page); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); flush_dcache_page(page); kunmap_atomic(kaddr); zero_user_segment(page, len, PAGE_SIZE); SetPageUptodate(page); brelse(iloc.bh); out: return ret; } int ext4_readpage_inline(struct inode *inode, struct page *page) { int ret = 0; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); return -EAGAIN; } /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ if (!page->index) ret = ext4_read_inline_page(inode, page); else if (!PageUptodate(page)) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); return ret >= 0 ? 0 : ret; } static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, unsigned flags) { int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct page *page = NULL; unsigned from, to; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { /* * clear the flag so that no new write * will trap here again. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } /* We cannot recurse into the filesystem as the transaction is already * started */ flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; goto out; } ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { ret = 0; goto out; } from = 0; to = ext4_get_inline_size(inode); if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) goto out; } ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; if (ext4_should_dioread_nolock(inode)) { ret = __block_write_begin(page, from, to, ext4_get_block_unwritten); } else ret = __block_write_begin(page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), from, to, NULL, do_journal_get_write_access); } if (ret) { unlock_page(page); put_page(page); page = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (page) block_commit_write(page, from, to); out: if (page) { unlock_page(page); put_page(page); } if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); return ret; } /* * Try to write data in the inode. * If the inode has inline data, check whether the new write can be * in the inode also. If not, create the page the handle, move the data * to the page make it update and let the later codes create extent for it. */ int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep) { int ret; handle_t *handle; struct page *page; struct ext4_iloc iloc; if (pos + len > ext4_get_max_inline_size(inode)) goto convert; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; /* * The possible write could happen in the inode, * so try to reserve the space in inode first. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out; /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); brelse(iloc.bh); goto convert; } ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (ret) goto out; flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; goto out; } *pagep = page; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; unlock_page(page); put_page(page); goto out_up_read; } if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) { unlock_page(page); put_page(page); goto out_up_read; } } ret = 1; handle = NULL; out_up_read: up_read(&EXT4_I(inode)->xattr_sem); out: if (handle && (ret != 1)) ext4_journal_stop(handle); brelse(iloc.bh); return ret; convert: return ext4_convert_inline_data_to_extent(mapping, inode, flags); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; struct ext4_iloc iloc; int ret = 0, ret2; if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; if (likely(copied)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) { unlock_page(page); put_page(page); ext4_std_error(inode->i_sb, ret); goto out; } ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); /* * ei->i_inline_off may have changed since * ext4_write_begin() called * ext4_try_to_write_inline_data() */ (void) ext4_find_inline_data_nolock(inode); kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); kunmap_atomic(kaddr); SetPageUptodate(page); /* clear page dirty so that writepages wouldn't work for us. */ ClearPageDirty(page); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); /* * It's important to update i_size while still holding page * lock: page writeout could otherwise come in and zero * beyond i_size. */ ext4_update_inode_size(inode, pos + copied); } unlock_page(page); put_page(page); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock * ordering of page lock and transaction start for journaling * filesystems. */ if (likely(copied)) mark_inode_dirty(inode); out: /* * If we didn't copy as much data as expected, we need to trim back * size of xattr containing inline data. */ if (pos + len > inode->i_size && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } struct buffer_head * ext4_journalled_write_inline_data(struct inode *inode, unsigned len, struct page *page) { int ret, no_expand; void *kaddr; struct ext4_iloc iloc; ret = ext4_get_inode_loc(inode, &iloc); if (ret) { ext4_std_error(inode->i_sb, ret); return NULL; } ext4_write_lock_xattr(inode, &no_expand); kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, 0, len); kunmap_atomic(kaddr); ext4_write_unlock_xattr(inode, &no_expand); return iloc.bh; } /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases: * 1. The inode is created and the first write exceeds inline size. We can * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't * need to start the journal since the file's metadata isn't changed now. */ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, unsigned flags, void **fsdata) { int ret = 0, inline_size; struct page *page; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) return -ENOMEM; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } inline_size = ext4_get_inline_size(inode); if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) goto out; } ret = __block_write_begin(page, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); put_page(page); ext4_truncate_failed_write(inode); return ret; } SetPageDirty(page); SetPageUptodate(page); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); if (page) { unlock_page(page); put_page(page); } return ret; } /* * Prepare the write for the inline data. * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the * normal ext4_da_write_end). */ int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int ret, inline_size; handle_t *handle; struct page *page; struct ext4_iloc iloc; int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } inline_size = ext4_get_max_inline_size(inode); ret = -ENOSPC; if (inline_size >= pos + len) { ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out_journal; } /* * We cannot recurse into the filesystem as the transaction * is already started. */ flags |= AOP_FLAG_NOFS; if (ret == -ENOSPC) { ext4_journal_stop(handle); ret = ext4_da_convert_inline_data_to_extent(mapping, inode, flags, fsdata); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; goto out; } page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; goto out_release_page; } if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) goto out_release_page; } ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (ret) goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); *pagep = page; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); put_page(page); out_journal: ext4_journal_stop(handle); out: brelse(iloc.bh); return ret; } #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) { int offset; unsigned short de_len; struct ext4_dir_entry_2 *de = inline_start; void *dlimit = inline_start + inline_size; trace_printk("inode %lu\n", dir->i_ino); offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", offset, de_len, de->name_len, de->name, de->name_len, le32_to_cpu(de->inode)); if (ext4_check_dir_entry(dir, NULL, de, bh, inline_start, inline_size, offset)) BUG(); offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } } #else #define ext4_show_inline_dir(dir, bh, inline_start, inline_size) #endif /* * Add a new entry into a inline dir. * It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { int err; struct ext4_dir_entry_2 *de; err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) return err; ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; } static void *ext4_get_inline_xattr_pos(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; BUG_ON(!EXT4_I(inode)->i_inline_off); header = IHDR(inode, ext4_raw_inode(iloc)); entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + EXT4_I(inode)->i_inline_off); return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); } /* Set the final de to cover the whole block. */ static void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; int de_len; de = (struct ext4_dir_entry_2 *)de_buf; if (old_size) { limit = de_buf + old_size; do { prev_de = de; de_len = ext4_rec_len_from_disk(de->rec_len, old_size); de_buf += de_len; de = (struct ext4_dir_entry_2 *)de_buf; } while (de_buf < limit); prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - old_size, new_size); } else { /* this is just created, so create an empty entry. */ de->inode = 0; de->rec_len = ext4_rec_len_to_disk(new_size, new_size); } } static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, struct ext4_iloc *iloc) { int ret; int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, new_size + EXT4_MIN_INLINE_DATA_SIZE); if (ret) return ret; ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE); dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; return 0; } static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { int ret; ret = ext4_create_inline_data(handle, inode, inline_size); if (ret) { ext4_msg(inode->i_sb, KERN_EMERG, "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", inode->i_ino, ret); return; } ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } static int ext4_finish_convert_inline_dir(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, void *buf, int inline_size) { int err, csum_size = 0, header_size = 0; struct ext4_dir_entry_2 *de; void *target = dir_block->b_data; /* * First create "." and ".." and then copy the dir information * back to the block. */ de = (struct ext4_dir_entry_2 *)target; de = ext4_init_dot_dotdot(inode, de, inode->i_sb->s_blocksize, csum_size, le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); header_size = (void *)de - target; memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; i_size_write(inode, inode->i_sb->s_blocksize); EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; ext4_update_final_de(dir_block->b_data, inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, inode->i_sb->s_blocksize - csum_size); if (csum_size) ext4_initialize_dirent_tail(dir_block, inode->i_sb->s_blocksize); set_buffer_uptodate(dir_block); unlock_buffer(dir_block); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) return err; set_buffer_verified(dir_block); return ext4_mark_inode_dirty(handle, inode); } static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int error; void *buf = NULL; struct buffer_head *data_bh = NULL; struct ext4_map_blocks map; int inline_size; inline_size = ext4_get_inline_size(inode); buf = kmalloc(inline_size, GFP_NOFS); if (!buf) { error = -ENOMEM; goto out; } error = ext4_read_inline_data(inode, buf, inline_size, iloc); if (error < 0) goto out; /* * Make sure the inline directory entries pass checks before we try to * convert them, so that we avoid touching stuff that needs fsck. */ if (S_ISDIR(inode->i_mode)) { error = ext4_check_all_de(inode, iloc->bh, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (error) goto out; } error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; map.m_lblk = 0; map.m_len = 1; map.m_flags = 0; error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); if (error < 0) goto out_restore; if (!(map.m_flags & EXT4_MAP_MAPPED)) { error = -EIO; goto out_restore; } data_bh = sb_getblk(inode->i_sb, map.m_pblk); if (!data_bh) { error = -ENOMEM; goto out_restore; } lock_buffer(data_bh); error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(data_bh); error = -EIO; goto out_restore; } memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); unlock_buffer(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { error = ext4_finish_convert_inline_dir(handle, inode, data_bh, buf, inline_size); } out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); out: brelse(data_bh); kfree(buf); return error; } /* * Try to add the new entry to the inline data. * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { int ret, ret2, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; ret = ext4_get_inode_loc(dir, &iloc); if (ret) return ret; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; /* check whether it can be inserted to inline xattr space. */ inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; if (!inline_size) { /* Try to use the xattr space.*/ ret = ext4_update_inline_dir(handle, dir, &iloc); if (ret && ret != -ENOSPC) goto out; inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; } /* * The inline space is filled up, so create a new block for it. * As the extent tree will be created, we have to save the inline * dir first. */ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); out: ext4_write_unlock_xattr(dir, &no_expand); ret2 = ext4_mark_inode_dirty(handle, dir); if (unlikely(ret2 && !ret)) ret = ret2; brelse(iloc.bh); return ret; } /* * This function fills a red-black tree with information from an * inlined dir. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data) { int err = 0, count = 0; unsigned int parent_ino; int pos; struct ext4_dir_entry_2 *de; struct inode *inode = file_inode(dir_file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; struct fscrypt_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; pos = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); while (pos < inline_size) { /* * As inlined dir doesn't store any information about '.' and * only the inode number of '..' is stored, we have to handle * them differently. */ if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; } else { de = (struct ext4_dir_entry_2 *)(dir_buf + pos); pos += ext4_rec_len_from_disk(de->rec_len, inline_size); if (ext4_check_dir_entry(inode, dir_file, de, iloc.bh, dir_buf, inline_size, pos)) { ret = count; goto out; } } if (ext4_hash_in_dirent(dir)) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if (err) { ret = err; goto out; } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { ret = err; goto out; } count++; } ret = count; out: kfree(dir_buf); brelse(iloc.bh); return ret; } /* * So this function is called when the volume is mkfsed with * dir_index disabled. In order to keep f_pos persistent * after we convert from an inlined dir to a blocked based, * we just pretend that we are a normal dir and return the * offset as if '.' and '..' really take place. * */ int ext4_read_inline_dir(struct file *file, struct dir_context *ctx, int *has_inline_data) { unsigned int offset, parent_ino; int i; struct ext4_dir_entry_2 *de; struct super_block *sb; struct inode *inode = file_inode(file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; ret = 0; sb = inode->i_sb; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); offset = ctx->pos; /* * dotdot_offset and dotdot_size is the real offset and * size for ".." and "." if the dir is block based while * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ dotdot_offset = ext4_dir_rec_len(1, NULL); dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; /* * If the version has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ if (!inode_eq_iversion(inode, file->f_version)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and * ".." is dotdot_offset. */ if (!i) { i = dotdot_offset; continue; } else if (i == dotdot_offset) { i = dotdot_size; continue; } /* for other entry, the real offset in * the buf has to be tuned accordingly. */ de = (struct ext4_dir_entry_2 *) (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) < ext4_dir_rec_len(1, NULL)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); } offset = i; ctx->pos = offset; file->f_version = inode_query_iversion(inode); } while (ctx->pos < extra_size) { if (ctx->pos == 0) { if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) goto out; ctx->pos = dotdot_offset; continue; } if (ctx->pos == dotdot_offset) { if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) goto out; ctx->pos = dotdot_size; continue; } de = (struct ext4_dir_entry_2 *) (dir_buf + ctx->pos - extra_offset); if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, extra_size, ctx->pos)) goto out; if (le32_to_cpu(de->inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto out; } ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } out: kfree(dir_buf); brelse(iloc.bh); return ret; } struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval) { struct ext4_iloc iloc; *retval = ext4_get_inode_loc(inode, &iloc); if (*retval) return NULL; *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; return iloc.bh; } /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. * In case of ENOSPC, the caller should create the normal disk layout dir. */ int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode) { int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; struct ext4_iloc iloc; struct ext4_dir_entry_2 *de; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; ret = ext4_prepare_inline_data(handle, inode, inline_size); if (ret) goto out; /* * For inline dir, we only save the inode information for the ".." * and create a fake dentry to cover the left space. */ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; de->inode = cpu_to_le32(parent->i_ino); de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); de->inode = 0; de->rec_len = ext4_rec_len_to_disk( inline_size - EXT4_INLINE_DOTDOT_SIZE, inline_size); set_nlink(inode, 2); inode->i_size = EXT4_I(inode)->i_disksize = inline_size; out: brelse(iloc.bh); return ret; } struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int ret; void *inline_start; int inline_size; ret = ext4_get_inode_loc(dir, &is.iloc); if (ret) return ERR_PTR(ret); down_read(&EXT4_I(dir)->xattr_sem); ret = ext4_xattr_ibody_find(dir, &i, &is); if (ret) goto out; if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) goto out; if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) goto out; inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; out: brelse(is.iloc.bh); if (ret < 0) is.iloc.bh = ERR_PTR(ret); else is.iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); return is.iloc.bh; } int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data) { int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; err = ext4_get_inode_loc(dir, &iloc); if (err) return err; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < EXT4_MIN_INLINE_DATA_SIZE) { inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; } else { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, inline_start, inline_size, 0); if (err) goto out; ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: ext4_write_unlock_xattr(dir, &no_expand); if (likely(err == 0)) err = ext4_mark_inode_dirty(handle, dir); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Get the inline dentry at offset. */ static inline struct ext4_dir_entry_2 * ext4_get_inline_entry(struct inode *inode, struct ext4_iloc *iloc, unsigned int offset, void **inline_start, int *inline_size) { void *inline_pos; BUG_ON(offset > ext4_get_inline_size(inode)); if (offset < EXT4_MIN_INLINE_DATA_SIZE) { inline_pos = (void *)ext4_raw_inode(iloc)->i_block; *inline_size = EXT4_MIN_INLINE_DATA_SIZE; } else { inline_pos = ext4_get_inline_xattr_pos(inode, iloc); offset -= EXT4_MIN_INLINE_DATA_SIZE; *inline_size = ext4_get_inline_size(inode) - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_start) *inline_start = inline_pos; return (struct ext4_dir_entry_2 *)(inline_pos + offset); } bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; bool ret = false; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, "error %d getting inode %lu block", err, dir->i_ino); return false; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; ret = true; goto out; } de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; if (!le32_to_cpu(de->inode)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); goto out; } inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, iloc.bh, inline_pos, inline_size, offset)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - " "inode %u, rec_len %u, name_len %d" "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); goto out; } if (le32_to_cpu(de->inode)) { goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } ret = true; out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); return ret; } int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret, no_expand; ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); return ret; } int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) { __u64 addr; int error = -EAGAIN; struct ext4_iloc iloc; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) goto out; error = ext4_get_inode_loc(inode, &iloc); if (error) goto out; addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; addr += offsetof(struct ext4_inode, i_block); brelse(iloc.bh); iomap->addr = addr; iomap->offset = 0; iomap->length = min_t(loff_t, ext4_get_inline_size(inode), i_size_read(inode)); iomap->type = IOMAP_INLINE; iomap->flags = 0; out: up_read(&EXT4_I(inode)->xattr_sem); return error; } int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; int inline_size, value_len, needed_blocks, no_expand, err = 0; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; needed_blocks = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { ext4_write_unlock_xattr(inode, &no_expand); *has_inline = 0; ext4_journal_stop(handle); return 0; } if ((err = ext4_orphan_add(handle, inode)) != 0) goto out; if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0) goto out; down_write(&EXT4_I(inode)->i_data_sem); i_size = inode->i_size; inline_size = ext4_get_inline_size(inode); EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { /* * if there's inline data to truncate and this file was * converted to extents after that inline data was written, * the extent status cache must be cleared to avoid leaving * behind stale delayed allocated extent entries */ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { retry: err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (err == -ENOMEM) { cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } if (err) goto out_error; } /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; BUG_ON(is.s.not_found); value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); if (!value) { err = -ENOMEM; goto out_error; } err = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, value_len); if (err <= 0) goto out_error; i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; err = ext4_xattr_ibody_set(handle, inode, &i, &is); if (err) goto out_error; } /* Clear the content within i_blocks. */ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; memset(p + i_size, 0, EXT4_MIN_INLINE_DATA_SIZE - i_size); } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE : i_size; } out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); if (err == 0) { inode->i_mtime = inode->i_ctime = current_time(inode); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_journal_stop(handle); return err; } int ext4_convert_inline_data(struct inode *inode) { int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { /* * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is * cleared. This means we are in the middle of moving of * inline data to delay allocated block. Just force writeout * here to finish conversion. */ error = filemap_flush(inode->i_mapping); if (error) return error; if (!ext4_has_inline_data(inode)) return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_free; } ext4_write_lock_xattr(inode, &no_expand); if (ext4_has_inline_data(inode)) error = ext4_convert_inline_data_nolock(handle, inode, &iloc); ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); return error; } |
706 489 487 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _ASM_X86_INSN_H #define _ASM_X86_INSN_H /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2009 */ #include <asm/byteorder.h> /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> /* __ignore_sync_check__ */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) struct insn_field { union { insn_value_t value; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; static inline void insn_field_set(struct insn_field *p, insn_value_t v, unsigned char n) { p->value = v; p->nbytes = n; } static inline void insn_set_byte(struct insn_field *p, unsigned char n, insn_byte_t v) { p->bytes[n] = v; } #else struct insn_field { insn_value_t value; union { insn_value_t little; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; static inline void insn_field_set(struct insn_field *p, insn_value_t v, unsigned char n) { p->value = v; p->little = __cpu_to_le32(v); p->nbytes = n; } static inline void insn_set_byte(struct insn_field *p, unsigned char n, insn_byte_t v) { p->bytes[n] = v; p->value = __le32_to_cpu(p->little); } #endif struct insn { struct insn_field prefixes; /* * Prefixes * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ struct insn_field vex_prefix; /* VEX prefix */ struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 * opcode.bytes[2]: opcode3 */ struct insn_field modrm; struct insn_field sib; struct insn_field displacement; union { struct insn_field immediate; struct insn_field moffset1; /* for 64bit MOV */ struct insn_field immediate1; /* for 64bit imm or off16/32 */ }; union { struct insn_field moffset2; /* for 64bit MOV */ struct insn_field immediate2; /* for 64bit imm or seg16 */ }; int emulate_prefix_size; insn_attr_t attr; unsigned char opnd_bytes; unsigned char addr_bytes; unsigned char length; unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; #define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) #define X86_MODRM_RM(modrm) ((modrm) & 0x07) #define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) #define X86_REX_W(rex) ((rex) & 8) #define X86_REX_R(rex) ((rex) & 4) #define X86_REX_X(rex) ((rex) & 2) #define X86_REX_B(rex) ((rex) & 1) /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ #define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ #define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ #define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ #define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ /* VEX bit fields */ #define X86_EVEX_M(vex) ((vex) & 0x03) /* EVEX Byte1 */ #define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ #define X86_VEX2_M 1 /* VEX2.M always 1 */ #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); extern int insn_get_opcode(struct insn *insn); extern int insn_get_modrm(struct insn *insn); extern int insn_get_sib(struct insn *insn); extern int insn_get_displacement(struct insn *insn); extern int insn_get_immediate(struct insn *insn); extern int insn_get_length(struct insn *insn); enum insn_mode { INSN_MODE_32, INSN_MODE_64, /* Mode is determined by the current kernel build. */ INSN_MODE_KERN, INSN_NUM_MODES, }; extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); #define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN) /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) { insn_get_modrm(insn); } /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.value != 0); } static inline int insn_is_evex(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.nbytes == 4); } static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; } static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX2_M; else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */ return X86_VEX3_M(insn->vex_prefix.bytes[1]); else /* EVEX */ return X86_EVEX_M(insn->vex_prefix.bytes[1]); } static inline insn_byte_t insn_vex_p_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX_P(insn->vex_prefix.bytes[1]); else return X86_VEX_P(insn->vex_prefix.bytes[2]); } /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { if (insn_is_avx(insn)) return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); return 0; } /* Offset of each field from kaddr */ static inline int insn_offset_rex_prefix(struct insn *insn) { return insn->prefixes.nbytes; } static inline int insn_offset_vex_prefix(struct insn *insn) { return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; } static inline int insn_offset_opcode(struct insn *insn) { return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; } static inline int insn_offset_modrm(struct insn *insn) { return insn_offset_opcode(insn) + insn->opcode.nbytes; } static inline int insn_offset_sib(struct insn *insn) { return insn_offset_modrm(insn) + insn->modrm.nbytes; } static inline int insn_offset_displacement(struct insn *insn) { return insn_offset_sib(insn) + insn->sib.nbytes; } static inline int insn_offset_immediate(struct insn *insn) { return insn_offset_displacement(insn) + insn->displacement.nbytes; } /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix * and the index is stored in @idx (note that this @idx is just for a cursor, * do not change it.) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ #define for_each_insn_prefix(insn, idx, prefix) \ for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e /* * Intel SDM Vol.3A 6.8.3 states; * "Any single-step trap that would be delivered following the MOV to SS * instruction or POP to SS instruction (because EFLAGS.TF is 1) is * suppressed." * This function returns true if @insn is MOV SS or POP SS. On these * instructions, single stepping is suppressed. */ static inline int insn_masking_exception(struct insn *insn) { return insn->opcode.bytes[0] == POP_SS_OPCODE || (insn->opcode.bytes[0] == MOV_SREG_OPCODE && X86_MODRM_REG(insn->modrm.bytes[0]) == 2); } #endif /* _ASM_X86_INSN_H */ |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2020 Facebook */ #include <linux/fs.h> #include <linux/anon_inodes.h> #include <linux/filter.h> #include <linux/bpf.h> struct bpf_iter_target_info { struct list_head list; const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; struct bpf_iter_link { struct bpf_link link; struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; bool done_stop; u8 target_private[] __aligned(8); }; static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->seq_num++; } static void bpf_iter_dec_seq_num(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->seq_num--; } static void bpf_iter_done_stop(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->done_stop = true; } static bool bpf_iter_support_resched(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; } /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): * . fixed buffer size (PAGE_SIZE) * . assuming no_llseek * . stop() may call bpf program, handling potential overflow there */ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; int err = 0, num_objs = 0; bool can_resched; void *p; mutex_lock(&seq->lock); if (!seq->buf) { seq->size = PAGE_SIZE << 3; seq->buf = kvmalloc(seq->size, GFP_KERNEL); if (!seq->buf) { err = -ENOMEM; goto done; } } if (seq->count) { n = min(seq->count, size); err = copy_to_user(buf, seq->buf + seq->from, n); if (err) { err = -EFAULT; goto done; } seq->count -= n; seq->from += n; copied = n; goto done; } seq->from = 0; p = seq->op->start(seq, &seq->index); if (!p) goto stop; if (IS_ERR(p)) { err = PTR_ERR(p); seq->op->stop(seq, p); seq->count = 0; goto done; } err = seq->op->show(seq, p); if (err > 0) { /* object is skipped, decrease seq_num, so next * valid object can reuse the same seq_num. */ bpf_iter_dec_seq_num(seq); seq->count = 0; } else if (err < 0 || seq_has_overflowed(seq)) { if (!err) err = -E2BIG; seq->op->stop(seq, p); seq->count = 0; goto done; } can_resched = bpf_iter_support_resched(seq); while (1) { loff_t pos = seq->index; num_objs++; offs = seq->count; p = seq->op->next(seq, p, &seq->index); if (pos == seq->index) { pr_info_ratelimited("buggy seq_file .next function %ps " "did not updated position index\n", seq->op->next); seq->index++; } if (IS_ERR_OR_NULL(p)) break; /* got a valid next object, increase seq_num */ bpf_iter_inc_seq_num(seq); if (seq->count >= size) break; if (num_objs >= MAX_ITER_OBJECTS) { if (offs == 0) { err = -EAGAIN; seq->op->stop(seq, p); goto done; } break; } err = seq->op->show(seq, p); if (err > 0) { bpf_iter_dec_seq_num(seq); seq->count = offs; } else if (err < 0 || seq_has_overflowed(seq)) { seq->count = offs; if (offs == 0) { if (!err) err = -E2BIG; seq->op->stop(seq, p); goto done; } break; } if (can_resched) cond_resched(); } stop: offs = seq->count; /* bpf program called if !p */ seq->op->stop(seq, p); if (!p) { if (!seq_has_overflowed(seq)) { bpf_iter_done_stop(seq); } else { seq->count = offs; if (offs == 0) { err = -E2BIG; goto done; } } } n = min(seq->count, size); err = copy_to_user(buf, seq->buf, n); if (err) { err = -EFAULT; goto done; } copied = n; seq->count -= n; seq->from = n; done: if (!copied) copied = err; else *ppos += copied; mutex_unlock(&seq->lock); return copied; } static const struct bpf_iter_seq_info * __get_seq_info(struct bpf_iter_link *link) { const struct bpf_iter_seq_info *seq_info; if (link->aux.map) { seq_info = link->aux.map->ops->iter_seq_info; if (seq_info) return seq_info; } return link->tinfo->reg_info->seq_info; } static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) { struct bpf_iter_priv_data *iter_priv; struct seq_file *seq; seq = file->private_data; if (!seq) return 0; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); if (iter_priv->seq_info->fini_seq_private) iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; return seq_release_private(inode, file); } const struct file_operations bpf_iter_fops = { .open = iter_open, .llseek = no_llseek, .read = bpf_seq_read, .release = iter_release, }; /* The argument reg_info will be cached in bpf_iter_target_info. * The common practice is to declare target reg_info as * a const static variable and passed as an argument to * bpf_iter_reg_target(). */ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); list_add(&tinfo->list, &targets); mutex_unlock(&targets_mutex); return 0; } void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; break; } } mutex_unlock(&targets_mutex); WARN_ON(found == false); } static void cache_btf_id(struct bpf_iter_target_info *tinfo, struct bpf_prog *prog) { tinfo->btf_id = prog->aux->attach_btf_id; } bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; struct bpf_iter_target_info *tinfo; int prefix_len = strlen(prefix); bool supported = false; if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { supported = true; break; } if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { cache_btf_id(tinfo, prog); supported = true; break; } } mutex_unlock(&targets_mutex); if (supported) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } return supported; } const struct bpf_func_proto * bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_iter_target_info *tinfo; const struct bpf_func_proto *fn = NULL; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { if (tinfo->btf_id == prog->aux->attach_btf_id) { const struct bpf_iter_reg *reg_info; reg_info = tinfo->reg_info; if (reg_info->get_func_proto) fn = reg_info->get_func_proto(func_id, prog); break; } } mutex_unlock(&targets_mutex); return fn; } static void bpf_iter_link_release(struct bpf_link *link) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); if (iter_link->tinfo->reg_info->detach_target) iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); kfree(iter_link); } static int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { int ret = 0; mutex_lock(&link_mutex); if (old_prog && link->prog != old_prog) { ret = -EPERM; goto out_unlock; } if (link->prog->type != new_prog->type || link->prog->expected_attach_type != new_prog->expected_attach_type || link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { ret = -EINVAL; goto out_unlock; } old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: mutex_unlock(&link_mutex); return ret; } static void bpf_iter_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); bpf_iter_show_fdinfo_t show_fdinfo; seq_printf(seq, "target_name:\t%s\n", iter_link->tinfo->reg_info->target); show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo; if (show_fdinfo) show_fdinfo(&iter_link->aux, seq); } static int bpf_iter_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); char __user *ubuf = u64_to_user_ptr(info->iter.target_name); bpf_iter_fill_link_info_t fill_link_info; u32 ulen = info->iter.target_name_len; const char *target_name; u32 target_len; if (!ulen ^ !ubuf) return -EINVAL; target_name = iter_link->tinfo->reg_info->target; target_len = strlen(target_name); info->iter.target_name_len = target_len + 1; if (ubuf) { if (ulen >= target_len + 1) { if (copy_to_user(ubuf, target_name, target_len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(ubuf, target_name, ulen - 1)) return -EFAULT; if (put_user(zero, ubuf + ulen - 1)) return -EFAULT; return -ENOSPC; } } fill_link_info = iter_link->tinfo->reg_info->fill_link_info; if (fill_link_info) return fill_link_info(&iter_link->aux, info); return 0; } static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, .update_prog = bpf_iter_link_replace, .show_fdinfo = bpf_iter_link_show_fdinfo, .fill_link_info = bpf_iter_link_fill_link_info, }; bool bpf_link_is_iter(struct bpf_link *link) { return link->ops == &bpf_iter_link_lops; } int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; bool existed = false; bpfptr_t ulinfo; int err; if (attr->link_create.target_fd || attr->link_create.flags) return -EINVAL; memset(&linfo, 0, sizeof(union bpf_iter_link_info)); ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel); linfo_len = attr->link_create.iter_info_len; if (bpfptr_is_null(ulinfo) ^ !linfo_len) return -EINVAL; if (!bpfptr_is_null(ulinfo)) { err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), linfo_len); if (err) return err; linfo_len = min_t(u32, linfo_len, sizeof(linfo)); if (copy_from_bpfptr(&linfo, ulinfo, linfo_len)) return -EFAULT; } prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { if (tinfo->btf_id == prog_btf_id) { existed = true; break; } } mutex_unlock(&targets_mutex); if (!existed) return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); link->tinfo = tinfo; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); return err; } if (tinfo->reg_info->attach_target) { err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); if (err) { bpf_link_cleanup(&link_primer); return err; } } return bpf_link_settle(&link_primer); } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; struct bpf_prog *prog; u32 total_priv_dsize; struct seq_file *seq; int err = 0; mutex_lock(&link_mutex); prog = link->link.prog; bpf_prog_inc(prog); mutex_unlock(&link_mutex); tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + seq_info->seq_priv_size; priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } if (seq_info->init_seq_private) { err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; return 0; release_seq_file: seq_release_private(file->f_inode, file); file->private_data = NULL; release_prog: bpf_prog_put(prog); return err; } int bpf_iter_new_fd(struct bpf_link *link) { struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; if (link->ops != &bpf_iter_link_lops) return -EINVAL; flags = O_RDONLY | O_CLOEXEC; fd = get_unused_fd_flags(flags); if (fd < 0) return fd; file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto free_fd; } iter_link = container_of(link, struct bpf_iter_link, link); err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; fd_install(fd, file); return fd; free_file: fput(file); free_fd: put_unused_fd(fd); return err; } struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) { struct bpf_iter_priv_data *iter_priv; struct seq_file *seq; void *seq_priv; seq = meta->seq; if (seq->file->f_op != &bpf_iter_fops) return NULL; seq_priv = seq->private; iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, target_private); if (in_stop && iter_priv->done_stop) return NULL; meta->session_id = iter_priv->session_id; meta->seq_num = iter_priv->seq_num; return iter_priv->prog; } int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { int ret; rcu_read_lock(); migrate_disable(); ret = bpf_prog_run(prog, ctx); migrate_enable(); rcu_read_unlock(); /* bpf program can only return 0 or 1: * 0 : okay * 1 : retry the same object * The bpf_iter_run_prog() return value * will be seq_ops->show() return value. */ return ret == 0 ? 0 : -EAGAIN; } BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, void *, callback_ctx, u64, flags) { return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); } const struct bpf_func_proto bpf_for_each_map_elem_proto = { .func = bpf_for_each_map_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_FUNC, .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; |
175 209 21 154 1 13 118 176 417 410 435 45 20 415 25 48 274 224 223 225 5 9 6 32 36 8 15 6 110 52 53 104 103 91 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 | // SPDX-License-Identifier: GPL-2.0 /* * Convert integer string representation to an integer. * If an integer doesn't fit into specified type, -E is returned. * * Integer starts with optional sign. * kstrtou*() functions do not accept sign "-". * * Radix 0 means autodetection: leading "0x" implies radix 16, * leading "0" implies radix 8, otherwise radix is 10. * Autodetection hints work after optional sign, but not before. * * If -E is returned, result is not touched. */ #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/kstrtox.h> #include <linux/math64.h> #include <linux/types.h> #include <linux/uaccess.h> #include "kstrtox.h" const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) { if (*base == 0) { if (s[0] == '0') { if (_tolower(s[1]) == 'x' && isxdigit(s[2])) *base = 16; else *base = 8; } else *base = 10; } if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x') s += 2; return s; } /* * Convert non-negative integer string representation in explicitly given radix * to an integer. A maximum of max_chars characters will be converted. * * Return number of characters consumed maybe or-ed with overflow bit. * If overflow occurs, result integer (incorrect) is still returned. * * Don't you dare use this function. */ unsigned int _parse_integer_limit(const char *s, unsigned int base, unsigned long long *p, size_t max_chars) { unsigned long long res; unsigned int rv; res = 0; rv = 0; while (max_chars--) { unsigned int c = *s; unsigned int lc = c | 0x20; /* don't tolower() this line */ unsigned int val; if ('0' <= c && c <= '9') val = c - '0'; else if ('a' <= lc && lc <= 'f') val = lc - 'a' + 10; else break; if (val >= base) break; /* * Check for overflow only if we are within range of * it in the max base we support (16) */ if (unlikely(res & (~0ull << 60))) { if (res > div_u64(ULLONG_MAX - val, base)) rv |= KSTRTOX_OVERFLOW; } res = res * base + val; rv++; s++; } *p = res; return rv; } unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p) { return _parse_integer_limit(s, base, p, INT_MAX); } static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) { unsigned long long _res; unsigned int rv; s = _parse_integer_fixup_radix(s, &base); rv = _parse_integer(s, base, &_res); if (rv & KSTRTOX_OVERFLOW) return -ERANGE; if (rv == 0) return -EINVAL; s += rv; if (*s == '\n') s++; if (*s) return -EINVAL; *res = _res; return 0; } /** * kstrtoull - convert a string to an unsigned long long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoull(). Return code must be checked. */ int kstrtoull(const char *s, unsigned int base, unsigned long long *res) { if (s[0] == '+') s++; return _kstrtoull(s, base, res); } EXPORT_SYMBOL(kstrtoull); /** * kstrtoll - convert a string to a long long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign or a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoll(). Return code must be checked. */ int kstrtoll(const char *s, unsigned int base, long long *res) { unsigned long long tmp; int rv; if (s[0] == '-') { rv = _kstrtoull(s + 1, base, &tmp); if (rv < 0) return rv; if ((long long)-tmp > 0) return -ERANGE; *res = -tmp; } else { rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if ((long long)tmp < 0) return -ERANGE; *res = tmp; } return 0; } EXPORT_SYMBOL(kstrtoll); /* Internal, do not use. */ int _kstrtoul(const char *s, unsigned int base, unsigned long *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (unsigned long)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(_kstrtoul); /* Internal, do not use. */ int _kstrtol(const char *s, unsigned int base, long *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (long)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(_kstrtol); /** * kstrtouint - convert a string to an unsigned int * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoul(). Return code must be checked. */ int kstrtouint(const char *s, unsigned int base, unsigned int *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (unsigned int)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtouint); /** * kstrtoint - convert a string to an int * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign or a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtol(). Return code must be checked. */ int kstrtoint(const char *s, unsigned int base, int *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (int)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtoint); int kstrtou16(const char *s, unsigned int base, u16 *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (u16)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtou16); int kstrtos16(const char *s, unsigned int base, s16 *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (s16)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtos16); int kstrtou8(const char *s, unsigned int base, u8 *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (u8)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtou8); int kstrtos8(const char *s, unsigned int base, s8 *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (s8)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtos8); /** * kstrtobool - convert common user inputs into boolean values * @s: input string * @res: result * * This routine returns 0 iff the first character is one of 'Yy1Nn0', or * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value * pointed to by res is updated upon finding a match. */ int kstrtobool(const char *s, bool *res) { if (!s) return -EINVAL; switch (s[0]) { case 'y': case 'Y': case '1': *res = true; return 0; case 'n': case 'N': case '0': *res = false; return 0; case 'o': case 'O': switch (s[1]) { case 'n': case 'N': *res = true; return 0; case 'f': case 'F': *res = false; return 0; default: break; } break; default: break; } return -EINVAL; } EXPORT_SYMBOL(kstrtobool); /* * Since "base" would be a nonsense argument, this open-codes the * _from_user helper instead of using the helper macro below. */ int kstrtobool_from_user(const char __user *s, size_t count, bool *res) { /* Longest string needed to differentiate, newline, terminator */ char buf[4]; count = min(count, sizeof(buf) - 1); if (copy_from_user(buf, s, count)) return -EFAULT; buf[count] = '\0'; return kstrtobool(buf, res); } EXPORT_SYMBOL(kstrtobool_from_user); #define kstrto_from_user(f, g, type) \ int f(const char __user *s, size_t count, unsigned int base, type *res) \ { \ /* sign, base 2 representation, newline, terminator */ \ char buf[1 + sizeof(type) * 8 + 1 + 1]; \ \ count = min(count, sizeof(buf) - 1); \ if (copy_from_user(buf, s, count)) \ return -EFAULT; \ buf[count] = '\0'; \ return g(buf, base, res); \ } \ EXPORT_SYMBOL(f) kstrto_from_user(kstrtoull_from_user, kstrtoull, unsigned long long); kstrto_from_user(kstrtoll_from_user, kstrtoll, long long); kstrto_from_user(kstrtoul_from_user, kstrtoul, unsigned long); kstrto_from_user(kstrtol_from_user, kstrtol, long); kstrto_from_user(kstrtouint_from_user, kstrtouint, unsigned int); kstrto_from_user(kstrtoint_from_user, kstrtoint, int); kstrto_from_user(kstrtou16_from_user, kstrtou16, u16); kstrto_from_user(kstrtos16_from_user, kstrtos16, s16); kstrto_from_user(kstrtou8_from_user, kstrtou8, u8); kstrto_from_user(kstrtos8_from_user, kstrtos8, s8); |
888 876 388 85 86 112 112 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 | // SPDX-License-Identifier: GPL-2.0-only /* * Monitoring code for network dropped packet alerts * * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <linux/percpu.h> #include <linux/timer.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/module.h> #include <net/genetlink.h> #include <net/netevent.h> #include <net/flow_offload.h> #include <net/devlink.h> #include <trace/events/skb.h> #include <trace/events/napi.h> #include <trace/events/devlink.h> #include <asm/unaligned.h> #define TRACE_ON 1 #define TRACE_OFF 0 /* * Globals, our netlink socket pointer * and the work handle that will send up * netlink alerts */ static int trace_state = TRACE_OFF; static bool monitor_hw; /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. * It also guards the global 'hw_stats_list' list. */ static DEFINE_MUTEX(net_dm_mutex); struct net_dm_stats { u64 dropped; struct u64_stats_sync syncp; }; #define NET_DM_MAX_HW_TRAP_NAME_LEN 40 struct net_dm_hw_entry { char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN]; u32 count; }; struct net_dm_hw_entries { u32 num_entries; struct net_dm_hw_entry entries[]; }; struct per_cpu_dm_data { raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { struct sk_buff *skb; struct net_dm_hw_entries *hw_entries; }; struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; struct net_dm_stats stats; }; struct dm_hw_stat_delta { struct net_device *dev; unsigned long last_rx; struct list_head list; struct rcu_head rcu; unsigned long last_drop_val; }; static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static LIST_HEAD(hw_stats_list); static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); void (*hw_work_item_func)(struct work_struct *work); void (*hw_trap_probe)(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata); }; struct net_dm_skb_cb { union { struct devlink_trap_metadata *hw_metadata; void *pc; }; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; struct net_dm_alert_msg *msg; struct nlattr *nla; struct sk_buff *skb; unsigned long flags; void *msg_header; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); al += sizeof(struct nlattr); skb = genlmsg_new(al, GFP_KERNEL); if (!skb) goto err; msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); if (!msg_header) { nlmsg_free(skb); skb = NULL; goto err; } nla = nla_reserve(skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); if (!nla) { nlmsg_free(skb); skb = NULL; goto err; } msg = nla_data(nla); memset(msg, 0, al); goto out; err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: raw_spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); raw_spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh); genlmsg_end(skb, genlmsg_data(gnlh)); } return skb; } static const struct genl_multicast_group dropmon_mcgrps[] = { { .name = "events", .cap_sys_admin = 1 }, }; static void send_dm_alert(struct work_struct *work) { struct sk_buff *skb; struct per_cpu_dm_data *data; data = container_of(work, struct per_cpu_dm_data, dm_alert_work); skb = reset_per_cpu_data(data); if (skb) genlmsg_multicast(&net_drop_monitor_family, skb, 0, 0, GFP_KERNEL); } /* * This is the timer function to delay the sending of an alert * in the event that more drops will arrive during the * hysteresis period. */ static void sched_send_work(struct timer_list *t) { struct per_cpu_dm_data *data = from_timer(data, t, send_timer); schedule_work(&data->dm_alert_work); } static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct net_dm_drop_point *point; struct nlmsghdr *nlh; struct nlattr *nla; int i; struct sk_buff *dskb; struct per_cpu_dm_data *data; unsigned long flags; local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); raw_spin_lock(&data->lock); dskb = data->skb; if (!dskb) goto out; nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); point = msg->points; for (i = 0; i < msg->entries; i++) { if (!memcmp(&location, &point->pc, sizeof(void *))) { point->count++; goto out; } point++; } if (msg->entries == dm_hit_limit) goto out; /* * We need to create a new entry */ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); memcpy(point->pc, &location, sizeof(void *)); point->count = 1; msg->entries++; if (!timer_pending(&data->send_timer)) { data->send_timer.expires = jiffies + dm_delay * HZ; add_timer(&data->send_timer); } out: raw_spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason) { trace_drop_common(skb, location); } static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { struct dm_hw_stat_delta *new_stat; /* * Don't check napi structures with no associated device */ if (!napi->dev) return; rcu_read_lock(); list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { struct net_device *dev; /* * only add a note to our monitor buffer if: * 1) this is the dev we received on * 2) its after the last_rx delta * 3) our rx_dropped count has gone up */ /* Paired with WRITE_ONCE() in dropmon_net_event() */ dev = READ_ONCE(new_stat->dev); if ((dev == napi->dev) && (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { trace_drop_common(NULL, NULL); new_stat->last_drop_val = napi->dev->stats.rx_dropped; new_stat->last_rx = jiffies; break; } } rcu_read_unlock(); } static struct net_dm_hw_entries * net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) { struct net_dm_hw_entries *hw_entries; unsigned long flags; hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), GFP_KERNEL); if (!hw_entries) { /* If the memory allocation failed, we try to perform another * allocation in 1/10 second. Otherwise, the probe function * will constantly bail out. */ mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } raw_spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); raw_spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } static int net_dm_hw_entry_put(struct sk_buff *msg, const struct net_dm_hw_entry *hw_entry) { struct nlattr *attr; attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY); if (!attr) return -EMSGSIZE; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_hw_entries_put(struct sk_buff *msg, const struct net_dm_hw_entries *hw_entries) { struct nlattr *attr; int i; attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES); if (!attr) return -EMSGSIZE; for (i = 0; i < hw_entries->num_entries; i++) { int rc; rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]); if (rc) goto nla_put_failure; } nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_hw_summary_report_fill(struct sk_buff *msg, const struct net_dm_hw_entries *hw_entries) { struct net_dm_alert_msg anc_hdr = { 0 }; void *hdr; int rc; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); if (!hdr) return -EMSGSIZE; /* We need to put the ancillary header in order not to break user * space. */ if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr)) goto nla_put_failure; rc = net_dm_hw_entries_put(msg, hw_entries); if (rc) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void net_dm_hw_summary_work(struct work_struct *work) { struct net_dm_hw_entries *hw_entries; struct per_cpu_dm_data *hw_data; struct sk_buff *msg; int rc; hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); if (!hw_entries) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out; rc = net_dm_hw_summary_report_fill(msg, hw_entries); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: kfree(hw_entries); } static void net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata) { struct net_dm_hw_entries *hw_entries; struct net_dm_hw_entry *hw_entry; struct per_cpu_dm_data *hw_data; unsigned long flags; int i; if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); raw_spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) goto out; for (i = 0; i < hw_entries->num_entries; i++) { hw_entry = &hw_entries->entries[i]; if (!strncmp(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { hw_entry->count++; goto out; } } if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit)) goto out; hw_entry = &hw_entries->entries[hw_entries->num_entries]; strscpy(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1); hw_entry->count = 1; hw_entries->num_entries++; if (!timer_pending(&hw_data->send_timer)) { hw_data->send_timer.expires = jiffies + dm_delay * HZ; add_timer(&hw_data->send_timer); } out: raw_spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { .kfree_skb_probe = trace_kfree_skb_hit, .napi_poll_probe = trace_napi_poll_hit, .work_item_func = send_dm_alert, .hw_work_item_func = net_dm_hw_summary_work, .hw_trap_probe = net_dm_hw_trap_summary_probe, }; static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; struct sk_buff *nskb; unsigned long flags; if (!skb_mac_header_was_set(skb)) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; NET_DM_SKB_CB(nskb)->pc = location; /* Override the timestamp because we care about the time when the * packet was dropped. */ nskb->tstamp = tstamp; data = this_cpu_ptr(&dm_cpu_data); spin_lock_irqsave(&data->drop_queue.lock, flags); if (skb_queue_len(&data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&data->drop_queue, nskb); else goto unlock_free; spin_unlock_irqrestore(&data->drop_queue.lock, flags); schedule_work(&data->dm_alert_work); return; unlock_free: spin_unlock_irqrestore(&data->drop_queue.lock, flags); u64_stats_update_begin(&data->stats.syncp); data->stats.dropped++; u64_stats_update_end(&data->stats.syncp); consume_skb(nskb); } static void net_dm_packet_trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { } static size_t net_dm_in_port_size(void) { /* NET_DM_ATTR_IN_PORT nest */ return nla_total_size(0) + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PORT_NETDEV_NAME */ nla_total_size(IFNAMSIZ + 1); } #define NET_DM_MAX_SYMBOL_LEN 40 static size_t net_dm_packet_report_size(size_t payload_len) { size_t size; size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + /* NET_DM_ATTR_ORIGIN */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PC */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_SYMBOL */ nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex, const char *name) { struct nlattr *attr; attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT); if (!attr) return -EMSGSIZE; if (ifindex && nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) goto nla_put_failure; if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc; char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; void *hdr; int rc; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_PACKET_ALERT); if (!hdr) return -EMSGSIZE; if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD)) goto nla_put_failure; snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc); if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) goto nla_put_failure; rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL); if (rc) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) goto nla_put_failure; if (!payload_len) goto out; if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) goto nla_put_failure; attr = skb_put(msg, nla_total_size(payload_len)); attr->nla_type = NET_DM_ATTR_PAYLOAD; attr->nla_len = nla_attr_size(payload_len); if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } #define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO) static void net_dm_packet_report(struct sk_buff *skb) { struct sk_buff *msg; size_t payload_len; int rc; /* Make sure we start copying the packet from the MAC header */ if (skb->data > skb_mac_header(skb)) skb_push(skb, skb->data - skb_mac_header(skb)); else skb_pull(skb, skb_mac_header(skb) - skb->data); /* Ensure packet fits inside a single netlink attribute */ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); if (!msg) goto out; rc = net_dm_packet_report_fill(msg, skb, payload_len); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: consume_skb(skb); } static void net_dm_packet_work(struct work_struct *work) { struct per_cpu_dm_data *data; struct sk_buff_head list; struct sk_buff *skb; unsigned long flags; data = container_of(work, struct per_cpu_dm_data, dm_alert_work); __skb_queue_head_init(&list); spin_lock_irqsave(&data->drop_queue.lock, flags); skb_queue_splice_tail_init(&data->drop_queue, &list); spin_unlock_irqrestore(&data->drop_queue.lock, flags); while ((skb = __skb_dequeue(&list))) net_dm_packet_report(skb); } static size_t net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata) { return hw_metadata->fa_cookie ? nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; } static size_t net_dm_hw_packet_report_size(size_t payload_len, const struct devlink_trap_metadata *hw_metadata) { size_t size; size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + /* NET_DM_ATTR_ORIGIN */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */ nla_total_size(strlen(hw_metadata->trap_group_name) + 1) + /* NET_DM_ATTR_HW_TRAP_NAME */ nla_total_size(strlen(hw_metadata->trap_name) + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_FLOW_ACTION_COOKIE */ net_dm_flow_action_cookie_size(hw_metadata) + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } static int net_dm_hw_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { struct devlink_trap_metadata *hw_metadata; struct nlattr *attr; void *hdr; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_PACKET_ALERT); if (!hdr) return -EMSGSIZE; if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW)) goto nla_put_failure; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME, hw_metadata->trap_group_name)) goto nla_put_failure; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_metadata->trap_name)) goto nla_put_failure; if (hw_metadata->input_dev) { struct net_device *dev = hw_metadata->input_dev; int rc; rc = net_dm_packet_report_in_port_put(msg, dev->ifindex, dev->name); if (rc) goto nla_put_failure; } if (hw_metadata->fa_cookie && nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE, hw_metadata->fa_cookie->cookie_len, hw_metadata->fa_cookie->cookie)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) goto nla_put_failure; if (!payload_len) goto out; if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) goto nla_put_failure; attr = skb_put(msg, nla_total_size(payload_len)); attr->nla_type = NET_DM_ATTR_PAYLOAD; attr->nla_len = nla_attr_size(payload_len); if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct devlink_trap_metadata * net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) { const struct flow_action_cookie *fa_cookie; struct devlink_trap_metadata *hw_metadata; const char *trap_group_name; const char *trap_name; hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); if (!hw_metadata) return NULL; trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC); if (!trap_group_name) goto free_hw_metadata; hw_metadata->trap_group_name = trap_group_name; trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC); if (!trap_name) goto free_trap_group; hw_metadata->trap_name = trap_name; if (metadata->fa_cookie) { size_t cookie_size = sizeof(*fa_cookie) + metadata->fa_cookie->cookie_len; fa_cookie = kmemdup(metadata->fa_cookie, cookie_size, GFP_ATOMIC); if (!fa_cookie) goto free_trap_name; hw_metadata->fa_cookie = fa_cookie; } hw_metadata->input_dev = metadata->input_dev; dev_hold(hw_metadata->input_dev); return hw_metadata; free_trap_name: kfree(trap_name); free_trap_group: kfree(trap_group_name); free_hw_metadata: kfree(hw_metadata); return NULL; } static void net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata) { dev_put(hw_metadata->input_dev); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); kfree(hw_metadata); } static void net_dm_hw_packet_report(struct sk_buff *skb) { struct devlink_trap_metadata *hw_metadata; struct sk_buff *msg; size_t payload_len; int rc; if (skb->data > skb_mac_header(skb)) skb_push(skb, skb->data - skb_mac_header(skb)); else skb_pull(skb, skb_mac_header(skb) - skb->data); payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata), GFP_KERNEL); if (!msg) goto out; rc = net_dm_hw_packet_report_fill(msg, skb, payload_len); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata); consume_skb(skb); } static void net_dm_hw_packet_work(struct work_struct *work) { struct per_cpu_dm_data *hw_data; struct sk_buff_head list; struct sk_buff *skb; unsigned long flags; hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); __skb_queue_head_init(&list); spin_lock_irqsave(&hw_data->drop_queue.lock, flags); skb_queue_splice_tail_init(&hw_data->drop_queue, &list); spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); while ((skb = __skb_dequeue(&list))) net_dm_hw_packet_report(skb); } static void net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata) { struct devlink_trap_metadata *n_hw_metadata; ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *hw_data; struct sk_buff *nskb; unsigned long flags; if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) return; if (!skb_mac_header_was_set(skb)) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; n_hw_metadata = net_dm_hw_metadata_copy(metadata); if (!n_hw_metadata) goto free; NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata; nskb->tstamp = tstamp; hw_data = this_cpu_ptr(&dm_hw_cpu_data); spin_lock_irqsave(&hw_data->drop_queue.lock, flags); if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&hw_data->drop_queue, nskb); else goto unlock_free; spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); schedule_work(&hw_data->dm_alert_work); return; unlock_free: spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); u64_stats_update_begin(&hw_data->stats.syncp); hw_data->stats.dropped++; u64_stats_update_end(&hw_data->stats.syncp); net_dm_hw_metadata_free(n_hw_metadata); free: consume_skb(nskb); } static const struct net_dm_alert_ops net_dm_alert_packet_ops = { .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, .work_item_func = net_dm_packet_work, .hw_work_item_func = net_dm_hw_packet_work, .hw_trap_probe = net_dm_hw_trap_packet_probe, }; static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, }; #if IS_ENABLED(CONFIG_NET_DEVLINK) static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL); } static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) { unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL); tracepoint_synchronize_unregister(); } #else static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { return -EOPNOTSUPP; } static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) { } #endif static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu, rc; if (monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); return -EAGAIN; } ops = net_dm_alert_ops_arr[net_dm_alert_mode]; if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; } for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct net_dm_hw_entries *hw_entries; INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func); timer_setup(&hw_data->send_timer, sched_send_work, 0); hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); kfree(hw_entries); } rc = net_dm_hw_probe_register(ops); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint"); goto err_module_put; } monitor_hw = true; return 0; err_module_put: for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); consume_skb(skb); } } module_put(THIS_MODULE); return rc; } static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu; if (!monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); return; } ops = net_dm_alert_ops_arr[net_dm_alert_mode]; monitor_hw = false; net_dm_hw_probe_unregister(ops); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); consume_skb(skb); } } module_put(THIS_MODULE); } static int net_dm_trace_on_set(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu, rc; ops = net_dm_alert_ops_arr[net_dm_alert_mode]; if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; } for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; INIT_WORK(&data->dm_alert_work, ops->work_item_func); timer_setup(&data->send_timer, sched_send_work, 0); /* Allocate a new per-CPU skb for the summary alert message and * free the old one which might contain stale data from * previous tracing. */ skb = reset_per_cpu_data(data); consume_skb(skb); } rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint"); goto err_module_put; } rc = register_trace_napi_poll(ops->napi_poll_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint"); goto err_unregister_trace; } return 0; err_unregister_trace: unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); err_module_put: for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); } module_put(THIS_MODULE); return rc; } static void net_dm_trace_off_set(void) { struct dm_hw_stat_delta *new_stat, *temp; const struct net_dm_alert_ops *ops; int cpu; ops = net_dm_alert_ops_arr[net_dm_alert_mode]; unregister_trace_napi_poll(ops->napi_poll_probe, NULL); unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); tracepoint_synchronize_unregister(); /* Make sure we do not send notifications to user space after request * to stop tracing returns. */ for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); } list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { if (new_stat->dev == NULL) { list_del_rcu(&new_stat->list); kfree_rcu(new_stat, rcu); } } module_put(THIS_MODULE); } static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack) { int rc = 0; if (state == trace_state) { NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state"); return -EAGAIN; } switch (state) { case TRACE_ON: rc = net_dm_trace_on_set(extack); break; case TRACE_OFF: net_dm_trace_off_set(); break; default: rc = 1; break; } if (!rc) trace_state = state; else rc = -EINPROGRESS; return rc; } static bool net_dm_is_monitoring(void) { return trace_state == TRACE_ON || monitor_hw; } static int net_dm_alert_mode_get_from_info(struct genl_info *info, enum net_dm_alert_mode *p_alert_mode) { u8 val; val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]); switch (val) { case NET_DM_ALERT_MODE_SUMMARY: case NET_DM_ALERT_MODE_PACKET: *p_alert_mode = val; break; default: return -EINVAL; } return 0; } static int net_dm_alert_mode_set(struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; enum net_dm_alert_mode alert_mode; int rc; if (!info->attrs[NET_DM_ATTR_ALERT_MODE]) return 0; rc = net_dm_alert_mode_get_from_info(info, &alert_mode); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode"); return -EINVAL; } net_dm_alert_mode = alert_mode; return 0; } static void net_dm_trunc_len_set(struct genl_info *info) { if (!info->attrs[NET_DM_ATTR_TRUNC_LEN]) return; net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); } static void net_dm_queue_len_set(struct genl_info *info) { if (!info->attrs[NET_DM_ATTR_QUEUE_LEN]) return; net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]); } static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; int rc; if (net_dm_is_monitoring()) { NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring"); return -EBUSY; } rc = net_dm_alert_mode_set(info); if (rc) return rc; net_dm_trunc_len_set(info); net_dm_queue_len_set(info); return 0; } static int net_dm_monitor_start(bool set_sw, bool set_hw, struct netlink_ext_ack *extack) { bool sw_set = false; int rc; if (set_sw) { rc = set_all_monitor_traces(TRACE_ON, extack); if (rc) return rc; sw_set = true; } if (set_hw) { rc = net_dm_hw_monitor_start(extack); if (rc) goto err_monitor_hw; } return 0; err_monitor_hw: if (sw_set) set_all_monitor_traces(TRACE_OFF, extack); return rc; } static void net_dm_monitor_stop(bool set_sw, bool set_hw, struct netlink_ext_ack *extack) { if (set_hw) net_dm_hw_monitor_stop(extack); if (set_sw) set_all_monitor_traces(TRACE_OFF, extack); } static int net_dm_cmd_trace(struct sk_buff *skb, struct genl_info *info) { bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS]; bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS]; struct netlink_ext_ack *extack = info->extack; /* To maintain backward compatibility, we start / stop monitoring of * software drops if no flag is specified. */ if (!set_sw && !set_hw) set_sw = true; switch (info->genlhdr->cmd) { case NET_DM_CMD_START: return net_dm_monitor_start(set_sw, set_hw, extack); case NET_DM_CMD_STOP: net_dm_monitor_stop(set_sw, set_hw, extack); return 0; } return -EOPNOTSUPP; } static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW); if (!hdr) return -EMSGSIZE; if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; int rc; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rc = net_dm_config_fill(msg, info); if (rc) goto free_msg; return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); return rc; } static void net_dm_stats_read(struct net_dm_stats *stats) { int cpu; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct net_dm_stats *cpu_stats = &data->stats; unsigned int start; u64 dropped; do { start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); dropped = cpu_stats->dropped; } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); stats->dropped += dropped; } } static int net_dm_stats_put(struct sk_buff *msg) { struct net_dm_stats stats; struct nlattr *attr; net_dm_stats_read(&stats); attr = nla_nest_start(msg, NET_DM_ATTR_STATS); if (!attr) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, stats.dropped, NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static void net_dm_hw_stats_read(struct net_dm_stats *stats) { int cpu; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct net_dm_stats *cpu_stats = &hw_data->stats; unsigned int start; u64 dropped; do { start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); dropped = cpu_stats->dropped; } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); stats->dropped += dropped; } } static int net_dm_hw_stats_put(struct sk_buff *msg) { struct net_dm_stats stats; struct nlattr *attr; net_dm_hw_stats_read(&stats); attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS); if (!attr) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, stats.dropped, NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; int rc; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW); if (!hdr) return -EMSGSIZE; rc = net_dm_stats_put(msg); if (rc) goto nla_put_failure; rc = net_dm_hw_stats_put(msg); if (rc) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; int rc; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rc = net_dm_stats_fill(msg, info); if (rc) goto free_msg; return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); return rc; } static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dm_hw_stat_delta *new_stat = NULL; struct dm_hw_stat_delta *tmp; switch (event) { case NETDEV_REGISTER: new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); if (!new_stat) goto out; new_stat->dev = dev; new_stat->last_rx = jiffies; mutex_lock(&net_dm_mutex); list_add_rcu(&new_stat->list, &hw_stats_list); mutex_unlock(&net_dm_mutex); break; case NETDEV_UNREGISTER: mutex_lock(&net_dm_mutex); list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { if (new_stat->dev == dev) { /* Paired with READ_ONCE() in trace_napi_poll_hit() */ WRITE_ONCE(new_stat->dev, NULL); if (trace_state == TRACE_OFF) { list_del_rcu(&new_stat->list); kfree_rcu(new_stat, rcu); break; } } } mutex_unlock(&net_dm_mutex); break; } out: return NOTIFY_DONE; } static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG }, [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG }, }; static const struct genl_small_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_config, .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_CONFIG_GET, .doit = net_dm_cmd_config_get, }, { .cmd = NET_DM_CMD_STATS_GET, .doit = net_dm_cmd_stats_get, }, }; static int net_dm_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_lock(&net_dm_mutex); return 0; } static void net_dm_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_unlock(&net_dm_mutex); } static struct genl_family net_drop_monitor_family __ro_after_init = { .hdrsize = 0, .name = "NET_DM", .version = 2, .maxattr = NET_DM_ATTR_MAX, .policy = net_dm_nl_policy, .pre_doit = net_dm_nl_pre_doit, .post_doit = net_dm_nl_post_doit, .module = THIS_MODULE, .small_ops = dropmon_ops, .n_small_ops = ARRAY_SIZE(dropmon_ops), .mcgrps = dropmon_mcgrps, .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), }; static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) { raw_spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data) { WARN_ON(!skb_queue_empty(&data->drop_queue)); } static void net_dm_cpu_data_init(int cpu) { struct per_cpu_dm_data *data; data = &per_cpu(dm_cpu_data, cpu); __net_dm_cpu_data_init(data); } static void net_dm_cpu_data_fini(int cpu) { struct per_cpu_dm_data *data; data = &per_cpu(dm_cpu_data, cpu); /* At this point, we should have exclusive access * to this struct and can free the skb inside it. */ consume_skb(data->skb); __net_dm_cpu_data_fini(data); } static void net_dm_hw_cpu_data_init(int cpu) { struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); __net_dm_cpu_data_init(hw_data); } static void net_dm_hw_cpu_data_fini(int cpu) { struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); kfree(hw_data->hw_entries); __net_dm_cpu_data_fini(hw_data); } static int __init init_net_drop_monitor(void) { int cpu, rc; pr_info("Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { pr_err("Unable to store program counters on this arch, Drop monitor failed\n"); return -ENOSPC; } for_each_possible_cpu(cpu) { net_dm_cpu_data_init(cpu); net_dm_hw_cpu_data_init(cpu); } rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); return rc; } rc = genl_register_family(&net_drop_monitor_family); if (rc) { pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; goto out; out_unreg: WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } static void exit_net_drop_monitor(void) { int cpu; /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ BUG_ON(genl_unregister_family(&net_drop_monitor_family)); BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } } module_init(init_net_drop_monitor); module_exit(exit_net_drop_monitor); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); MODULE_ALIAS_GENL_FAMILY("NET_DM"); MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts"); |
6 6 6 2796 2799 2796 2804 2 623 623 427 2794 2802 427 621 200 198 199 1772 1777 425 434 7 7 5 4 4 74 74 42 42 42 42 42 42 11 3 8 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /** * idmap_key struct holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /** * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /** * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /** * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } static u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /** * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /** * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } static u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /** * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /** * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup(map->forward, map->nr_extents * sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); |
81 6 82 81 6 81 81 81 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strlcpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. */ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... */ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (owner && !try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) return -EBUSY; error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) return error; kobject_get(p->kobj.parent); return 0; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @dev: the device structure * @cdev: the cdev structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. * * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); |
50 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 | // SPDX-License-Identifier: GPL-2.0 /* -*- linux-c -*- * sysctl_net_core.c: sysctl interface to net core subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/core directory entry (empty =) ). [MS] */ #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> static int two = 2; static int three = 3; static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static long long_one __maybe_unused = 1; static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); /* 0 - Keep current behavior: * IPv4: inherit all current settings from init_net * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; struct ctl_table tmp = { .data = &size, .maxlen = sizeof(size), .mode = table->mode }; struct rps_sock_flow_table *orig_sock_table, *sock_table; static DEFINE_MUTEX(sock_flow_mutex); mutex_lock(&sock_flow_mutex); orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { if (size) { if (size > 1<<29) { /* Enforce limit to prevent overflow */ mutex_unlock(&sock_flow_mutex); return -EINVAL; } size = roundup_pow_of_two(size); if (size != orig_size) { sock_table = vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); if (!sock_table) { mutex_unlock(&sock_flow_mutex); return -ENOMEM; } rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; for (i = 0; i < size; i++) sock_table->ents[i] = RPS_NO_CPU; } else sock_table = NULL; if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); } if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); synchronize_rcu(); vfree(orig_sock_table); } } } mutex_unlock(&sock_flow_mutex); return ret; } #endif /* CONFIG_RPS */ #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; cpumask_var_t mask; int i, len, ret = 0; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; if (write) { ret = cpumask_parse(buffer, mask); if (ret) goto done; mutex_lock(&flow_limit_update_mutex); len = sizeof(*cur) + netdev_flow_limit_table_len; for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); cur = rcu_dereference_protected(sd->flow_limit, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); synchronize_rcu(); kfree(cur); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); if (!cur) { /* not unwinding previous changes */ ret = -ENOMEM; goto write_unlock; } cur->num_buckets = netdev_flow_limit_table_len; rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { char kbuf[128]; if (*ppos || !*lenp) { *lenp = 0; goto done; } cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); if (rcu_dereference(sd->flow_limit)) cpumask_set_cpu(i, mask); } rcu_read_unlock(); len = min(sizeof(kbuf) - 1, *lenp); len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; goto done; } if (len < *lenp) kbuf[len++] = '\n'; memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } done: free_cpumask_var(mask); return ret; } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; mutex_lock(&flow_limit_update_mutex); ptr = table->data; old = *ptr; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write && !is_power_of_2(*ptr)) { *ptr = old; ret = -EINVAL; } mutex_unlock(&flow_limit_update_mutex); return ret; } #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { .data = id, .maxlen = IFNAMSIZ, }; int ret; qdisc_get_default(id, IFNAMSIZ); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = qdisc_set_default(id); return ret; } #endif static int proc_do_dev_weight(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { static DEFINE_MUTEX(dev_weight_mutex); int ret, weight; mutex_lock(&dev_weight_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); return ret; } static int proc_do_rss_key(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); fake_table.data = buf; fake_table.maxlen = sizeof(buf); return proc_dostring(&fake_table, write, buffer, lenp, ppos); } #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; tmp.data = &jit_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (jit_enable < 2 || (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { *(int *)table->data = jit_enable; if (jit_enable == 2) pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); } else { ret = -EPERM; } } return ret; } # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } # endif /* CONFIG_HAVE_EBPF_JIT */ static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif static struct ctl_table net_core_table[] = { { .procname = "wmem_max", .data = &sysctl_wmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, { .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", .data = &dev_weight_rx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", .data = &dev_weight_tx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", .data = &netdev_max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "netdev_rss_key", .data = &netdev_rss_key, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_do_rss_key, }, #ifdef CONFIG_BPF_JIT { .procname = "bpf_jit_enable", .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_bpf_enable, # ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, # else .extra1 = SYSCTL_ZERO, .extra2 = &two, # endif }, # ifdef CONFIG_HAVE_EBPF_JIT { .procname = "bpf_jit_harden", .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, # endif { .procname = "bpf_jit_limit", .data = &bpf_jit_limit, .maxlen = sizeof(long), .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = &long_one, .extra2 = &bpf_jit_limit_max, }, #endif { .procname = "netdev_tstamp_prequeue", .data = &netdev_tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "message_cost", .data = &net_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "message_burst", .data = &net_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "optmem_max", .data = &sysctl_optmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tstamp_allow_data", .data = &sysctl_tstamp_allow_data, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", .maxlen = sizeof(int), .mode = 0644, .proc_handler = rps_sock_flow_sysctl }, #endif #ifdef CONFIG_NET_FLOW_LIMIT { .procname = "flow_limit_cpu_bitmap", .mode = 0644, .proc_handler = flow_limit_cpu_sysctl }, { .procname = "flow_limit_table_len", .data = &netdev_flow_limit_table_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_RX_BUSY_POLL { .procname = "busy_poll", .data = &sysctl_net_busy_poll, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "busy_read", .data = &sysctl_net_busy_read, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NET_SCHED { .procname = "default_qdisc", .mode = 0644, .maxlen = IFNAMSIZ, .proc_handler = set_default_qdisc }, #endif { .procname = "netdev_budget", .data = &netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "warnings", .data = &net_msg_warn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_skb_frags", .data = &sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &max_skb_frags, }, { .procname = "netdev_budget_usecs", .data = &netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "fb_tunnels_only_for_init_net", .data = &sysctl_fb_tunnels_only_for_init_net, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { .procname = "devconf_inherit_init_net", .data = &sysctl_devconf_inherit_init_net, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &three, }, { .procname = "high_order_alloc_disable", .data = &net_high_order_alloc_disable_key.key, .maxlen = sizeof(net_high_order_alloc_disable_key), .mode = 0644, .proc_handler = proc_do_static_key, }, { .procname = "gro_normal_batch", .data = &gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "netdev_unregister_timeout_secs", .data = &netdev_unregister_timeout_secs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, { } }; static struct ctl_table netns_core_table[] = { { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, { } }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) { /* fallback tunnels for initns only */ if (!strncmp(str, "initns", 6)) sysctl_fb_tunnels_only_for_init_net = 1; /* no fallback tunnels anywhere */ else if (!strncmp(str, "none", 4)) sysctl_fb_tunnels_only_for_init_net = 2; return 1; } __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; tbl[0].data = &net->core.sysctl_somaxconn; /* Don't export any sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { tbl[0].procname = NULL; } } net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); if (net->core.sysctl_hdr == NULL) goto err_reg; return 0; err_reg: if (tbl != netns_core_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_core_net_exit(struct net *net) { struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_core_ops = { .init = sysctl_core_net_init, .exit = sysctl_core_net_exit, }; static __init int sysctl_core_init(void) { register_net_sysctl(&init_net, "net/core", net_core_table); return register_pernet_subsys(&sysctl_core_ops); } fs_initcall(sysctl_core_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | /* SPDX-License-Identifier: GPL-2.0 */ /* include/net/dsfield.h - Manipulation of the Differentiated Services field */ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ #ifndef __NET_DSFIELD_H #define __NET_DSFIELD_H #include <linux/types.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <asm/byteorder.h> static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) { return iph->tos; } static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { return ntohs(*(__force const __be16 *)ipv6h) >> 4; } static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask, __u8 value) { __u32 check = ntohs((__force __be16)iph->check); __u8 dsfield; dsfield = (iph->tos & mask) | value; check += iph->tos; if ((check+1) >> 16) check = (check+1) & 0xffff; check -= dsfield; check += check >> 16; /* adjust carry */ iph->check = (__force __sum16)htons(check); iph->tos = dsfield; } static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask, __u8 value) { __be16 *p = (__force __be16 *)ipv6h; *p = (*p & htons((((u16)mask << 4) | 0xf00f))) | htons((u16)value << 4); } #endif |
50 50 410 109 6 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 | // SPDX-License-Identifier: GPL-2.0 /* * class.c - basic device class management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2003-2004 Greg Kroah-Hartman * Copyright (c) 2003-2004 IBM Corp. */ #include <linux/device/class.h> #include <linux/device.h> #include <linux/module.h> #include <linux/init.h> #include <linux/string.h> #include <linux/kdev_t.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/genhd.h> #include <linux/mutex.h> #include "base.h" #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) static ssize_t class_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *class_attr = to_class_attr(attr); struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->show) ret = class_attr->show(cp->class, class_attr, buf); return ret; } static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *class_attr = to_class_attr(attr); struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->store) ret = class_attr->store(cp->class, class_attr, buf, count); return ret; } static void class_release(struct kobject *kobj) { struct subsys_private *cp = to_subsys_private(kobj); struct class *class = cp->class; pr_debug("class '%s': release.\n", class->name); if (class->class_release) class->class_release(class); else pr_debug("class '%s' does not have a release() function, " "be careful\n", class->name); kfree(cp); } static const struct kobj_ns_type_operations *class_child_ns_type(struct kobject *kobj) { struct subsys_private *cp = to_subsys_private(kobj); struct class *class = cp->class; return class->ns_type; } static const struct sysfs_ops class_sysfs_ops = { .show = class_attr_show, .store = class_attr_store, }; static struct kobj_type class_ktype = { .sysfs_ops = &class_sysfs_ops, .release = class_release, .child_ns_type = class_child_ns_type, }; /* Hotplug events for classes go to the class subsys */ static struct kset *class_kset; int class_create_file_ns(struct class *cls, const struct class_attribute *attr, const void *ns) { int error; if (cls) error = sysfs_create_file_ns(&cls->p->subsys.kobj, &attr->attr, ns); else error = -EINVAL; return error; } void class_remove_file_ns(struct class *cls, const struct class_attribute *attr, const void *ns) { if (cls) sysfs_remove_file_ns(&cls->p->subsys.kobj, &attr->attr, ns); } static struct class *class_get(struct class *cls) { if (cls) kset_get(&cls->p->subsys); return cls; } static void class_put(struct class *cls) { if (cls) kset_put(&cls->p->subsys); } static struct device *klist_class_to_dev(struct klist_node *n) { struct device_private *p = to_device_private_class(n); return p->device; } static void klist_class_dev_get(struct klist_node *n) { struct device *dev = klist_class_to_dev(n); get_device(dev); } static void klist_class_dev_put(struct klist_node *n) { struct device *dev = klist_class_to_dev(n); put_device(dev); } static int class_add_groups(struct class *cls, const struct attribute_group **groups) { return sysfs_create_groups(&cls->p->subsys.kobj, groups); } static void class_remove_groups(struct class *cls, const struct attribute_group **groups) { return sysfs_remove_groups(&cls->p->subsys.kobj, groups); } int __class_register(struct class *cls, struct lock_class_key *key) { struct subsys_private *cp; int error; pr_debug("device class '%s': registering\n", cls->name); cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) return -ENOMEM; klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); INIT_LIST_HEAD(&cp->interfaces); kset_init(&cp->glue_dirs); __mutex_init(&cp->mutex, "subsys mutex", key); error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); if (error) { kfree(cp); return error; } /* set the default /sys/dev directory for devices of this class */ if (!cls->dev_kobj) cls->dev_kobj = sysfs_dev_char_kobj; #if defined(CONFIG_BLOCK) /* let the block class directory show up in the root of sysfs */ if (!sysfs_deprecated || cls != &block_class) cp->subsys.kobj.kset = class_kset; #else cp->subsys.kobj.kset = class_kset; #endif cp->subsys.kobj.ktype = &class_ktype; cp->class = cls; cls->p = cp; error = kset_register(&cp->subsys); if (error) { kfree(cp); return error; } error = class_add_groups(class_get(cls), cls->class_groups); class_put(cls); if (error) { kobject_del(&cp->subsys.kobj); kfree_const(cp->subsys.kobj.name); kfree(cp); } return error; } EXPORT_SYMBOL_GPL(__class_register); void class_unregister(struct class *cls) { pr_debug("device class '%s': unregistering\n", cls->name); class_remove_groups(cls, cls->class_groups); kset_unregister(&cls->p->subsys); } static void class_create_release(struct class *cls) { pr_debug("%s called for %s\n", __func__, cls->name); kfree(cls); } /** * __class_create - create a struct class structure * @owner: pointer to the module that is to "own" this struct class * @name: pointer to a string for the name of this class. * @key: the lock_class_key for this class; used by mutex lock debugging * * This is used to create a struct class pointer that can then be used * in calls to device_create(). * * Returns &struct class pointer on success, or ERR_PTR() on error. * * Note, the pointer created here is to be destroyed when finished by * making a call to class_destroy(). */ struct class *__class_create(struct module *owner, const char *name, struct lock_class_key *key) { struct class *cls; int retval; cls = kzalloc(sizeof(*cls), GFP_KERNEL); if (!cls) { retval = -ENOMEM; goto error; } cls->name = name; cls->owner = owner; cls->class_release = class_create_release; retval = __class_register(cls, key); if (retval) goto error; return cls; error: kfree(cls); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(__class_create); /** * class_destroy - destroys a struct class structure * @cls: pointer to the struct class that is to be destroyed * * Note, the pointer to be destroyed must have been created with a call * to class_create(). */ void class_destroy(struct class *cls) { if ((cls == NULL) || (IS_ERR(cls))) return; class_unregister(cls); } /** * class_dev_iter_init - initialize class device iterator * @iter: class iterator to initialize * @class: the class we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize class iterator @iter such that it iterates over devices * of @class. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ void class_dev_iter_init(struct class_dev_iter *iter, struct class *class, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_class; klist_iter_init_node(&class->p->klist_devices, &iter->ki, start_knode); iter->type = type; } EXPORT_SYMBOL_GPL(class_dev_iter_init); /** * class_dev_iter_next - iterate to the next device * @iter: class iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into class code. */ struct device *class_dev_iter_next(struct class_dev_iter *iter) { struct klist_node *knode; struct device *dev; while (1) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = klist_class_to_dev(knode); if (!iter->type || iter->type == dev->type) return dev; } } EXPORT_SYMBOL_GPL(class_dev_iter_next); /** * class_dev_iter_exit - finish iteration * @iter: class iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ void class_dev_iter_exit(struct class_dev_iter *iter) { klist_iter_exit(&iter->ki); } EXPORT_SYMBOL_GPL(class_dev_iter_exit); /** * class_for_each_device - device iterator * @class: the class we're iterating * @start: the device to start with in the list, if any. * @data: data for the callback * @fn: function to be called for each device * * Iterate over @class's list of devices, and call @fn for each, * passing it @data. If @start is set, the list iteration will start * there, otherwise if it is NULL, the iteration starts at the * beginning of the list. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * @fn is allowed to do anything including calling back into class * code. There's no locking restriction. */ int class_for_each_device(struct class *class, struct device *start, void *data, int (*fn)(struct device *, void *)) { struct class_dev_iter iter; struct device *dev; int error = 0; if (!class) return -EINVAL; if (!class->p) { WARN(1, "%s called for class '%s' before it was initialized", __func__, class->name); return -EINVAL; } class_dev_iter_init(&iter, class, start, NULL); while ((dev = class_dev_iter_next(&iter))) { error = fn(dev, data); if (error) break; } class_dev_iter_exit(&iter); return error; } EXPORT_SYMBOL_GPL(class_for_each_device); /** * class_find_device - device iterator for locating a particular device * @class: the class we're iterating * @start: Device to begin with * @data: data for the match function * @match: function to check device * * This is similar to the class_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. * * Note, you will need to drop the reference with put_device() after use. * * @match is allowed to do anything including calling back into class * code. There's no locking restriction. */ struct device *class_find_device(struct class *class, struct device *start, const void *data, int (*match)(struct device *, const void *)) { struct class_dev_iter iter; struct device *dev; if (!class) return NULL; if (!class->p) { WARN(1, "%s called for class '%s' before it was initialized", __func__, class->name); return NULL; } class_dev_iter_init(&iter, class, start, NULL); while ((dev = class_dev_iter_next(&iter))) { if (match(dev, data)) { get_device(dev); break; } } class_dev_iter_exit(&iter); return dev; } EXPORT_SYMBOL_GPL(class_find_device); int class_interface_register(struct class_interface *class_intf) { struct class *parent; struct class_dev_iter iter; struct device *dev; if (!class_intf || !class_intf->class) return -ENODEV; parent = class_get(class_intf->class); if (!parent) return -EINVAL; mutex_lock(&parent->p->mutex); list_add_tail(&class_intf->node, &parent->p->interfaces); if (class_intf->add_dev) { class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->add_dev(dev, class_intf); class_dev_iter_exit(&iter); } mutex_unlock(&parent->p->mutex); return 0; } void class_interface_unregister(struct class_interface *class_intf) { struct class *parent = class_intf->class; struct class_dev_iter iter; struct device *dev; if (!parent) return; mutex_lock(&parent->p->mutex); list_del_init(&class_intf->node); if (class_intf->remove_dev) { class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->remove_dev(dev, class_intf); class_dev_iter_exit(&iter); } mutex_unlock(&parent->p->mutex); class_put(parent); } ssize_t show_class_attr_string(struct class *class, struct class_attribute *attr, char *buf) { struct class_attribute_string *cs; cs = container_of(attr, struct class_attribute_string, attr); return sysfs_emit(buf, "%s\n", cs->str); } EXPORT_SYMBOL_GPL(show_class_attr_string); struct class_compat { struct kobject *kobj; }; /** * class_compat_register - register a compatibility class * @name: the name of the class * * Compatibility class are meant as a temporary user-space compatibility * workaround when converting a family of class devices to a bus devices. */ struct class_compat *class_compat_register(const char *name) { struct class_compat *cls; cls = kmalloc(sizeof(struct class_compat), GFP_KERNEL); if (!cls) return NULL; cls->kobj = kobject_create_and_add(name, &class_kset->kobj); if (!cls->kobj) { kfree(cls); return NULL; } return cls; } EXPORT_SYMBOL_GPL(class_compat_register); /** * class_compat_unregister - unregister a compatibility class * @cls: the class to unregister */ void class_compat_unregister(struct class_compat *cls) { kobject_put(cls->kobj); kfree(cls); } EXPORT_SYMBOL_GPL(class_compat_unregister); /** * class_compat_create_link - create a compatibility class device link to * a bus device * @cls: the compatibility class * @dev: the target bus device * @device_link: an optional device to which a "device" link should be created */ int class_compat_create_link(struct class_compat *cls, struct device *dev, struct device *device_link) { int error; error = sysfs_create_link(cls->kobj, &dev->kobj, dev_name(dev)); if (error) return error; /* * Optionally add a "device" link (typically to the parent), as a * class device would have one and we want to provide as much * backwards compatibility as possible. */ if (device_link) { error = sysfs_create_link(&dev->kobj, &device_link->kobj, "device"); if (error) sysfs_remove_link(cls->kobj, dev_name(dev)); } return error; } EXPORT_SYMBOL_GPL(class_compat_create_link); /** * class_compat_remove_link - remove a compatibility class device link to * a bus device * @cls: the compatibility class * @dev: the target bus device * @device_link: an optional device to which a "device" link was previously * created */ void class_compat_remove_link(struct class_compat *cls, struct device *dev, struct device *device_link) { if (device_link) sysfs_remove_link(&dev->kobj, "device"); sysfs_remove_link(cls->kobj, dev_name(dev)); } EXPORT_SYMBOL_GPL(class_compat_remove_link); int __init classes_init(void) { class_kset = kset_create_and_add("class", NULL, NULL); if (!class_kset) return -ENOMEM; return 0; } EXPORT_SYMBOL_GPL(class_create_file_ns); EXPORT_SYMBOL_GPL(class_remove_file_ns); EXPORT_SYMBOL_GPL(class_unregister); EXPORT_SYMBOL_GPL(class_destroy); EXPORT_SYMBOL_GPL(class_interface_register); EXPORT_SYMBOL_GPL(class_interface_unregister); |
7 7 32 26 26 26 161 6 6 26 26 26 26 26 167 166 167 1 6 9 5 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 | // SPDX-License-Identifier: GPL-2.0-only /* * Process number limiting controller for cgroups. * * Used to allow a cgroup hierarchy to stop any new processes from fork()ing * after a certain limit is reached. * * Since it is trivial to hit the task limit without hitting any kmemcg limits * in place, PIDs are a fundamental resource. As such, PID exhaustion must be * preventable in the scope of a cgroup hierarchy by allowing resource limiting * of the number of tasks in a cgroup. * * In order to use the `pids` controller, set the maximum number of tasks in * pids.max (this is not available in the root cgroup for obvious reasons). The * number of processes currently in the cgroup is given by pids.current. * Organisational operations are not blocked by cgroup policies, so it is * possible to have pids.current > pids.max. However, it is not possible to * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking * would cause a cgroup policy to be violated. * * To set a cgroup to have no limit, set pids.max to "max". This is the default * for all new cgroups (N.B. that PID limits are hierarchical, so the most * stringent limit in the hierarchy is followed). * * pids.current tracks all child cgroup hierarchies, so parent/pids.current is * a superset of parent/child/pids.current. * * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cgroup.h> #include <linux/slab.h> #include <linux/sched/task.h> #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" struct pids_cgroup { struct cgroup_subsys_state css; /* * Use 64-bit types so that we can safely represent "max" as * %PIDS_MAX = (%PID_MAX_LIMIT + 1). */ atomic64_t counter; atomic64_t limit; /* Handle for "pids.events" */ struct cgroup_file events_file; /* Number of times fork failed because limit was hit. */ atomic64_t events_limit; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) { return container_of(css, struct pids_cgroup, css); } static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) { return css_pids(pids->css.parent); } static struct cgroup_subsys_state * pids_css_alloc(struct cgroup_subsys_state *parent) { struct pids_cgroup *pids; pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); if (!pids) return ERR_PTR(-ENOMEM); atomic64_set(&pids->counter, 0); atomic64_set(&pids->limit, PIDS_MAX); atomic64_set(&pids->events_limit, 0); return &pids->css; } static void pids_css_free(struct cgroup_subsys_state *css) { kfree(css_pids(css)); } /** * pids_cancel - uncharge the local pid count * @pids: the pid cgroup state * @num: the number of pids to cancel * * This function will WARN if the pid count goes under 0, because such a case is * a bug in the pids controller proper. */ static void pids_cancel(struct pids_cgroup *pids, int num) { /* * A negative count (or overflow for that matter) is invalid, * and indicates a bug in the `pids` controller proper. */ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); } /** * pids_uncharge - hierarchically uncharge the pid count * @pids: the pid cgroup state * @num: the number of pids to uncharge */ static void pids_uncharge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; for (p = pids; parent_pids(p); p = parent_pids(p)) pids_cancel(p, num); } /** * pids_charge - hierarchically charge the pid count * @pids: the pid cgroup state * @num: the number of pids to charge * * This function does *not* follow the pid limit set. It cannot fail and the new * pid count may exceed the limit. This is only used for reverting failed * attaches, where there is no other way out than violating the limit. */ static void pids_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; for (p = pids; parent_pids(p); p = parent_pids(p)) atomic64_add(num, &p->counter); } /** * pids_try_charge - hierarchically try to charge the pid count * @pids: the pid cgroup state * @num: the number of pids to charge * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge * succeeded, otherwise -EAGAIN. */ static int pids_try_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p, *q; for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. */ if (new > limit) goto revert; } return 0; revert: for (q = pids; q != p; q = parent_pids(q)) pids_cancel(q, num); pids_cancel(p, num); return -EAGAIN; } static int pids_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *dst_css; cgroup_taskset_for_each(task, dst_css, tset) { struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; /* * No need to pin @old_css between here and cancel_attach() * because cgroup core protects it from being freed before * the migration completes or fails. */ old_css = task_css(task, pids_cgrp_id); old_pids = css_pids(old_css); pids_charge(pids, 1); pids_uncharge(old_pids, 1); } return 0; } static void pids_cancel_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *dst_css; cgroup_taskset_for_each(task, dst_css, tset) { struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; old_css = task_css(task, pids_cgrp_id); old_pids = css_pids(old_css); pids_charge(old_pids, 1); pids_uncharge(pids, 1); } } /* * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). */ static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; int err; if (cset) css = cset->subsys[pids_cgrp_id]; else css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); err = pids_try_charge(pids, 1); if (err) { /* Only log the first time events_limit is incremented. */ if (atomic64_inc_return(&pids->events_limit) == 1) { pr_info("cgroup: fork rejected by pids controller in "); pr_cont_cgroup_path(css->cgroup); pr_cont("\n"); } cgroup_file_notify(&pids->events_file); } return err; } static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; if (cset) css = cset->subsys[pids_cgrp_id]; else css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); pids_uncharge(pids, 1); } static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); pids_uncharge(pids, 1); } static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_subsys_state *css = of_css(of); struct pids_cgroup *pids = css_pids(css); int64_t limit; int err; buf = strstrip(buf); if (!strcmp(buf, PIDS_MAX_STR)) { limit = PIDS_MAX; goto set_limit; } err = kstrtoll(buf, 0, &limit); if (err) return err; if (limit < 0 || limit >= PIDS_MAX) return -EINVAL; set_limit: /* * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. */ atomic64_set(&pids->limit, limit); return nbytes; } static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); else seq_printf(sf, "%lld\n", limit); return 0; } static s64 pids_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct pids_cgroup *pids = css_pids(css); return atomic64_read(&pids->counter); } static int pids_events_show(struct seq_file *sf, void *v) { struct pids_cgroup *pids = css_pids(seq_css(sf)); seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); return 0; } static struct cftype pids_files[] = { { .name = "max", .write = pids_max_write, .seq_show = pids_max_show, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "current", .read_s64 = pids_current_read, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events", .seq_show = pids_events_show, .file_offset = offsetof(struct pids_cgroup, events_file), .flags = CFTYPE_NOT_ON_ROOT, }, { } /* terminate */ }; struct cgroup_subsys pids_cgrp_subsys = { .css_alloc = pids_css_alloc, .css_free = pids_css_free, .can_attach = pids_can_attach, .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, .release = pids_release, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, .threaded = true, }; |
930 930 923 929 1 1 926 930 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ /* * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: * The group->recnt and mark->refcnt tell how many "things" in the kernel * currently are referencing the objects. Both kind of objects typically will * live inside the kernel with a refcnt of 2, one for its creation and one for * the reference a group and a mark hold to each other. * If you are holding the appropriate locks, you can take a reference and the * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: * There are 3 locks involved with fsnotify inode marks and they MUST be taken * in order as follows: * * group->mark_mutex * mark->lock * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private * data (i.e group limits). * mark->lock protects the marks attributes like its masks and flags. * Furthermore it protects the access to a reference of the group that the mark * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * * mark->connector->lock protects the list of marks anchored inside an * inode / vfsmount and each mark is hooked via the i_list. * * A list of notification marks relating to inode / mnt is contained in * fsnotify_mark_connector. That structure is alive as long as there are any * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets * detached from fsnotify_mark_connector when last reference to the mark is * dropped. Thus having mark reference is enough to protect mark->connector * pointer and to make sure fsnotify_mark_connector cannot disappear. Also * because we remove mark from g_list before dropping mark reference associated * with that, any mark found through g_list is guaranteed to have * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) * - The inode is being evicted from cache. (fsnotify_inode_delete) * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> #include <linux/ratelimit.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; struct kmem_cache *fsnotify_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); static void fsnotify_connector_destroy_workfn(struct work_struct *work); static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { WARN_ON_ONCE(!refcount_read(&mark->refcnt)); refcount_inc(&mark->refcnt); } static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; return NULL; } __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) { if (WARN_ON(!fsnotify_valid_obj_type(conn->type))) return 0; return *fsnotify_conn_mask_p(conn); } static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); } /* * Grab or drop inode reference for the connector if needed. * * When it's time to drop the reference, we only clear the HAS_IREF flag and * return the inode object. fsnotify_drop_object() will be resonsible for doing * iput() outside of spinlocks. This happens when last mark that wanted iref is * detached. */ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, bool want_iref) { bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; struct inode *inode = NULL; if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == has_iref) return NULL; if (want_iref) { /* Pin inode if any mark wants inode refcount held */ fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; } else { /* Unpin inode after detach of last mark that wanted iref */ inode = fsnotify_conn_inode(conn); conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; } return inode; } static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; bool want_iref = false; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); /* We can get detached connector here when inode is getting unlinked. */ if (!fsnotify_valid_obj_type(conn->type)) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) continue; new_mask |= fsnotify_calc_mask(mark); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE && !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) want_iref = true; } *fsnotify_conn_mask_p(conn) = new_mask; return fsnotify_update_iref(conn, want_iref); } static bool fsnotify_conn_watches_children( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return false; return fsnotify_inode_watches_children(fsnotify_conn_inode(conn)); } static void fsnotify_conn_set_children_dentry_flags( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return; fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn)); } /* * Calculate mask of events for a list of marks. The caller must make sure * connector and connector->obj cannot disappear under us. Callers achieve * this by holding a mark->lock or mark->group->mark_mutex for a mark on this * list. */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { bool update_children; if (!conn) return; spin_lock(&conn->lock); update_children = !fsnotify_conn_watches_children(conn); __fsnotify_recalc_mask(conn); update_children &= fsnotify_conn_watches_children(conn); spin_unlock(&conn->lock); /* * Set children's PARENT_WATCHED flags only if parent started watching. * When parent stops watching, we clear false positive PARENT_WATCHED * flags lazily in __fsnotify_parent(). */ if (update_children) fsnotify_conn_set_children_dentry_flags(conn); } /* Free all connectors queued for freeing once SRCU period ends */ static void fsnotify_connector_destroy_workfn(struct work_struct *work) { struct fsnotify_mark_connector *conn, *free; spin_lock(&destroy_lock); conn = connector_destroy_list; connector_destroy_list = NULL; spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); while (conn) { free = conn; conn = conn->destroy_next; kmem_cache_free(fsnotify_mark_connector_cachep, free); } } static void fsnotify_put_inode_ref(struct inode *inode) { struct super_block *sb = inode->i_sb; iput(inode); if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) wake_up_var(&sb->s_fsnotify_connectors); } static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) { struct super_block *sb = fsnotify_connector_sb(conn); if (sb) atomic_long_inc(&sb->s_fsnotify_connectors); } static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) { struct super_block *sb = fsnotify_connector_sb(conn); if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) wake_up_var(&sb->s_fsnotify_connectors); } static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { struct inode *inode = NULL; *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; /* Unpin inode when detaching from connector */ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } fsnotify_put_sb_connectors(conn); rcu_assign_pointer(*(conn->obj), NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; return inode; } static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; if (WARN_ON_ONCE(!group)) return; group->ops->free_mark(mark); fsnotify_put_group(group); } /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } /* * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { objp = __fsnotify_recalc_mask(conn); type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); queue_work(system_unbound_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); queue_delayed_work(system_unbound_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object * list. Mark can be already removed from the list by now and on its way to be * destroyed once SRCU period ends. * * Also pin the group so it doesn't disappear under us. */ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { if (!mark) return true; if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { /* mark is attached, group is still alive then */ atomic_inc(&mark->group->user_waits); spin_unlock(&mark->lock); return true; } spin_unlock(&mark->lock); fsnotify_put_mark(mark); } return false; } /* * Puts marks and wakes up group destruction if necessary. * * Pairs with fsnotify_get_mark_safe() */ static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) { if (mark) { struct fsnotify_group *group = mark->group; fsnotify_put_mark(mark); /* * We abuse notification_waitq on group shutdown for waiting for * all marks pinned when waiting for userspace. */ if (atomic_dec_and_test(&group->user_waits) && group->shutdown) wake_up(&group->notification_waitq); } } bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) __releases(&fsnotify_mark_srcu) { int type; fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { __release(&fsnotify_mark_srcu); goto fail; } } /* * Now that both marks are pinned by refcount in the inode / vfsmount * lists, we can drop SRCU lock, and safely resume the list iteration * once userspace returns. */ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); return true; fail: for (type--; type >= 0; type--) fsnotify_put_mark_wake(iter_info->marks[type]); return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) __acquires(&fsnotify_mark_srcu) { int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } /* * Mark mark as detached, remove it from group list. Mark still stays in object * list until its last reference is dropped. Note that we rely on mark being * removed from group list before corresponding reference to it is dropped. In * particular we rely on mark->connector being valid while we hold * group->mark_mutex if we found the mark through g_list. * * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { fsnotify_group_assert_locked(mark->group); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; list_del_init(&mark->g_list); spin_unlock(&mark->lock); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } /* * Free fsnotify mark. The mark is actually only marked as being freed. The * freeing is actually happening only once last reference to the mark is * dropped from a workqueue which first waits for srcu period end. * * Caller must have a reference to the mark or be protected by * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark * is being freed. */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { fsnotify_group_lock(group); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); } EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. * * Fanotify supports different notification classes (reflected as priority of * notification group). Events shall be passed to notification groups in * decreasing priority order. To achieve this marks in notification lists for * inodes and vfsmounts are sorted so that priorities of corresponding groups * are descending. * * Furthermore correct handling of the ignore mask requires processing inode * and vfsmount marks of each group together. Using the group address as * further sort criterion provides a unique sorting order and thus we can * merge inode and vfsmount lists of marks in linear time and find groups * present in both lists. * * A return value of 1 signifies that b has priority over a. * A return value of 0 signifies that the two marks have to be handled together. * A return value of -1 signifies that a has priority over b. */ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) { if (a == b) return 0; if (!a) return 1; if (!b) return -1; if (a->priority < b->priority) return 1; if (a->priority > b->priority) return -1; if (a < b) return 1; return -1; } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, unsigned int obj_type, __kernel_fsid_t *fsid) { struct fsnotify_mark_connector *conn; conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); if (!conn) return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->type = obj_type; conn->obj = connp; /* Cache fsid of filesystem containing the object */ if (fsid) { conn->fsid = *fsid; conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID; } else { conn->fsid.val[0] = conn->fsid.val[1] = 0; conn->flags = 0; } fsnotify_get_sb_connectors(conn); /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ fsnotify_put_sb_connectors(conn); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } return 0; } /* * Get mark connector, make sure it is alive and return with its lock held. * This is for users that get connector pointer from inode or mount. Users that * hold reference to a mark on the list may directly lock connector->lock as * they are sure list cannot go away under them. */ static struct fsnotify_mark_connector *fsnotify_grab_connector( fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; int idx; idx = srcu_read_lock(&fsnotify_mark_srcu); conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (!conn) goto out; spin_lock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) { spin_unlock(&conn->lock); srcu_read_unlock(&fsnotify_mark_srcu, idx); return NULL; } out: srcu_read_unlock(&fsnotify_mark_srcu, idx); return conn; } /* * Add mark into proper place in given list of marks. These marks may be used * for the fsnotify backend to determine which event types should be delivered * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; /* Backend is expected to check for zero fsid (e.g. tmpfs) */ if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1])) return -ENODEV; restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); err = fsnotify_attach_connector_to_object(connp, obj_type, fsid); if (err) return err; goto restart; } else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) { conn->fsid = *fsid; /* Pairs with smp_rmb() in fanotify_get_fsid() */ smp_wmb(); conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID; } else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) && (fsid->val[0] != conn->fsid.val[0] || fsid->val[1] != conn->fsid.val[1])) { /* * Backend is expected to check for non uniform fsid * (e.g. btrfs), but maybe we missed something? * Only allow setting conn->fsid once to non zero fsid. * inotify and non-fid fanotify groups do not set nor test * conn->fsid. */ pr_warn_ratelimited("%s: fsid mismatch on object of type %u: " "%x.%x != %x.%x\n", __func__, conn->type, fsid->val[0], fsid->val[1], conn->fsid.val[0], conn->fsid.val[1]); err = -EXDEV; goto out_err; } /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); goto added; } /* should mark be in the middle of the current list? */ hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone * seeing mark->connector set. */ WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); return err; } /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid) { struct fsnotify_group *group = mark->group; int ret = 0; fsnotify_group_assert_locked(group); /* * LOCKING ORDER!!!! * group->mark_mutex * mark->lock * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid); if (ret) goto err; fsnotify_recalc_mask(mark->connector); return ret; err: spin_lock(&mark->lock); mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); fsnotify_put_mark(mark); return ret; } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid); fsnotify_group_unlock(group); return ret; } EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, struct fsnotify_group *group) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; conn = fsnotify_grab_connector(connp); if (!conn) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group && (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; } } spin_unlock(&conn->lock); return NULL; } EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. */ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our * to_free list so we have to use mark_mutex even when accessing that * list. And freeing mark requires us to drop mark_mutex. So we can * reliably free only the first mark in the list. That's why we first * move marks to free to to_free list in one go and then free marks in * to_free list one by one. */ fsnotify_group_lock(group); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } fsnotify_group_unlock(group); clear: while (1) { fsnotify_group_lock(group); if (list_empty(head)) { fsnotify_group_unlock(group); break; } mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } /* Destroy all marks attached to an object via connector */ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; void *objp; unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) return; /* * We have to be careful since we can race with e.g. * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the * list can get modified. However we are holding mark reference and * thus our mark cannot be removed from obj_list so we can continue * iteration after regaining conn->lock. */ hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); old_mark = mark; fsnotify_destroy_mark(mark, mark->group); spin_lock(&conn->lock); } /* * Detach list from object now so that we don't pin inode until all * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); fsnotify_drop_object(type, objp); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; WRITE_ONCE(mark->connector, NULL); } EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; spin_lock(&destroy_lock); /* exchange the list head */ list_replace_init(&destroy_list, &private_destroy_list); spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); fsnotify_final_mark_destroy(mark); } } /* Wait for all marks queued for destruction to be actually destroyed */ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); |
4 4 3 8 22 22 20 21 21 21 21 21 4 4 4 4 6 14 14 14 2 3 10 9 6 7 2 4 4 6 6 14 14 5 5 5 19 19 15 15 19 19 2 2 2 2 11 11 5 2 3 9 9 3 3 3 2 2 2 1 1 1 1 1 6 6 6 6 6 6 888 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "soft-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" #include "originator.h" #include "send.h" #include "translation-table.h" /** * batadv_skb_head_push() - Increase header size and move (push) head pointer * @skb: packet buffer which should be modified * @len: number of bytes to add * * Return: 0 on success or negative error number in case of failure */ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; /* TODO: We must check if we can release all references to non-payload * data using __skb_header_release in our skbs to allow skb_cow_header * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. */ result = skb_cow_head(skb, len); if (result < 0) return result; skb_push(skb, len); return 0; } static int batadv_interface_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static int batadv_interface_release(struct net_device *dev) { netif_stop_queue(dev); return 0; } /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the soft interface information * @idx: index of counter to sum up * * Return: sum of all cpu-local counters */ static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { counters = per_cpu_ptr(bat_priv->bat_counters, cpu); sum += counters[idx]; } return sum; } static struct net_device_stats *batadv_interface_stats(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX); stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES); stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED); stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX); stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES); return stats; } static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); ether_addr_copy(dev->dev_addr, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return 0; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } rcu_read_unlock(); return 0; } static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { struct batadv_priv *bat_priv = netdev_priv(dev); /* check ranges */ if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; bat_priv->mtu_set_by_user = new_mtu; return 0; } /** * batadv_interface_set_rx_mode() - set the rx mode of a device * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman * soft interface. However a dummy handler enables a user to set static * multicast listeners for instance. */ static void batadv_interface_set_rx_mode(struct net_device *dev) { } static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct net_device *soft_iface) { struct ethhdr *ethhdr; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode = BATADV_FORW_SINGLE; struct batadv_orig_node *mcast_single_orig = NULL; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 proto; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; switch (ntohs(proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, sizeof(*vhdr))) goto dropped; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; /* drop batman-in-batman packets to prevent loops */ if (proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } fallthrough; case ETH_P_BATMAN: goto dropped; } skb_set_network_header(skb, network_offset); if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; /* skb->data might have been reallocated by batadv_bla_tx() */ ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source) && !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); if (!client_added) goto dropped; } /* Snoop address candidates from DHCPACKs for early DAT filling */ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid); /* don't accept stp packets. STP does not help in meshes. * better use the bridge loop avoidance ... * * The same goes for ECTP sent at least by some Cisco Switches, * it might confuse the mesh when used with bridge loop avoidance. */ if (batadv_compare_eth(ethhdr->h_dest, stp_addr)) goto dropped; if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { do_bcast = true; goto send; } dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len, chaddr); /* skb->data may have been modified by * batadv_gw_dhcp_recipient_get() */ ethhdr = eth_hdr(skb); /* if gw_mode is on, broadcast any non-DHCP message. * All the DHCP packets are going to be sent as unicast */ if (dhcp_rcp == BATADV_DHCP_NO) { do_bcast = true; goto send; } if (dhcp_rcp == BATADV_DHCP_TO_CLIENT) dst_hint = chaddr; else if ((gw_mode == BATADV_GW_MODE_SERVER) && (dhcp_rcp == BATADV_DHCP_TO_SERVER)) /* gateways should not forward any DHCP message if * directed to a DHCP server */ goto dropped; send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, &mcast_single_orig, &mcast_is_routable); if (forw_mode == BATADV_FORW_NONE) goto dropped; if (forw_mode == BATADV_FORW_SINGLE || forw_mode == BATADV_FORW_SOME) do_bcast = false; } } batadv_skb_set_priority(skb, 0); /* ethernet packet should be broadcasted */ if (do_bcast) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto dropped; /* in case of ARP request, we do not immediately broadcasti the * packet, instead we first wait for DAT to try to retrieve the * correct ARP entry */ if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) brd_delay = msecs_to_jiffies(ARP_REQ_DELAY); if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0) goto dropped; bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; bcast_packet->reserved = 0; /* hw address of first interface is the orig mac because only * this mac is known throughout the mesh */ ether_addr_copy(bcast_packet->orig, primary_if->net_dev->dev_addr); /* set broadcast sequence number */ seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) { ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); } else if (mcast_single_orig) { ret = batadv_mcast_forw_send_orig(bat_priv, skb, vid, mcast_single_orig); } else if (forw_mode == BATADV_FORW_SOME) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) goto dropped; batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, vid); } if (ret != NET_XMIT_SUCCESS) goto dropped_freed; } batadv_inc_counter(bat_priv, BATADV_CNT_TX); batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len); goto end; dropped: kfree_skb(skb); dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: batadv_orig_node_put(mcast_single_orig); batadv_hardif_put(primary_if); return NETDEV_TX_OK; } /** * batadv_interface_rx() - receive ethernet frame on local batman-adv interface * @soft_iface: local interface which will receive the ethernet frame * @skb: ethernet frame for @soft_iface * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. * * The packet may still get dropped. This can happen when the encapsulated * ethernet frame is invalid or contains again an batman-adv packet. Also * unicast packets will be dropped directly when it was sent between two * isolated clients. */ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; int packet_type; batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; packet_type = batadv_bcast_packet->packet_type; skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); /* clean the netfilter state now that the batman-adv header has been * removed */ nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, VLAN_ETH_HLEN)) goto dropped; vhdr = skb_vlan_eth_hdr(skb); /* drop batman-in-batman packets to prevent loops */ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; fallthrough; case ETH_P_BATMAN: goto dropped; } /* skb->dev & skb->pkt_type are set here */ skb->protocol = eth_type_trans(skb, soft_iface); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ if (batadv_bla_rx(bat_priv, skb, vid, packet_type)) goto out; if (orig_node) batadv_tt_add_temporary_global_entry(bat_priv, orig_node, ethhdr->h_source, vid); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* set the mark on broadcast packets if AP isolation is ON and * the packet is coming from an "isolated" client */ if (batadv_vlan_ap_isola_get(bat_priv, vid) && batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source, vid)) { /* save bits in skb->mark not covered by the mask and * apply the mark on the rest */ skb->mark &= ~bat_priv->isolation_mark_mask; skb->mark |= bat_priv->isolation_mark; } } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid)) { goto dropped; } netif_rx(skb); goto out; dropped: kfree_skb(skb); out: return; } /** * batadv_softif_vlan_release() - release vlan from lists and queue for free * after rcu grace period * @ref: kref pointer of the vlan object */ void batadv_softif_vlan_release(struct kref *ref) { struct batadv_softif_vlan *vlan; vlan = container_of(ref, struct batadv_softif_vlan, refcount); spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); hlist_del_rcu(&vlan->list); spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); kfree_rcu(vlan, rcu); } /** * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan_tmp, *vlan = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) { if (vlan_tmp->vid != vid) continue; if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; break; } rcu_read_unlock(); return vlan; } /** * batadv_softif_create_vlan() - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan; spin_lock_bh(&bat_priv->softif_vlan_list_lock); vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); if (!vlan) { spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; } vlan->bat_priv = bat_priv; vlan->vid = vid; kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); /* don't return reference to new softif_vlan */ batadv_softif_vlan_put(vlan); return 0; } /** * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object * @bat_priv: the bat priv with all the soft interface information * @vlan: the object to remove */ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); batadv_softif_vlan_put(vlan); } /** * batadv_interface_add_vid() - ndo_add_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * * Return: 0 on success or a negative error code in case of failure. */ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; vid |= BATADV_VLAN_HAS_TAG; /* if a new vlan is getting created and it already exists, it means that * it was not deleted yet. batadv_softif_vlan_get() increases the * refcount in order to revive the object. * * if it does not exist then create it. */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return batadv_softif_create_vlan(bat_priv, vid); /* add a new TT local entry. This one will be marked with the NOPURGE * flag. This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); return 0; } /** * batadv_interface_kill_vid() - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. batman-adv does not know how to * handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return -ENOENT; batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ batadv_softif_vlan_put(vlan); return 0; } /* batman-adv network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue * @dev: device which owns the tx queue * @txq: tx queue to modify * @_unused: always NULL */ static void batadv_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); } /** * batadv_set_lockdep_class() - Set txq and addr_list lockdep class * @dev: network device to modify */ static void batadv_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify * * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; u32 random_seqno; int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; batadv_set_lockdep_class(dev); bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64)); if (!bat_priv->bat_counters) return -ENOMEM; atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); atomic_set(&bat_priv->hop_penalty, 30); #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); #endif atomic_set(&bat_priv->fragmentation, 1); atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN); atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN); atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); atomic_set(&bat_priv->bcast_seqno, 1); atomic_set(&bat_priv->tt.vn, 0); atomic_set(&bat_priv->tt.local_changes, 0); atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif atomic_set(&bat_priv->tp_num, 0); bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; bat_priv->isolation_mark_mask = 0; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->frag_seqno, random_seqno); bat_priv->primary_if = NULL; batadv_nc_init_bat_priv(bat_priv); if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) goto free_bat_counters; } ret = batadv_mesh_init(dev); if (ret < 0) goto free_bat_counters; return 0; free_bat_counters: free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; return ret; } /** * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev); out: batadv_hardif_put(hard_iface); return ret; } /** * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface != dev) goto out; batadv_hardif_disable_interface(hard_iface); ret = 0; out: batadv_hardif_put(hard_iface); return ret; } static const struct net_device_ops batadv_netdev_ops = { .ndo_init = batadv_softif_init_late, .ndo_open = batadv_interface_open, .ndo_stop = batadv_interface_release, .ndo_get_stats = batadv_interface_stats, .ndo_vlan_rx_add_vid = batadv_interface_add_vid, .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid, .ndo_set_mac_address = batadv_interface_set_mac_addr, .ndo_change_mtu = batadv_interface_change_mtu, .ndo_set_rx_mode = batadv_interface_set_rx_mode, .ndo_start_xmit = batadv_interface_tx, .ndo_validate_addr = eth_validate_addr, .ndo_add_slave = batadv_softif_slave_add, .ndo_del_slave = batadv_softif_slave_del, }; static void batadv_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); strscpy(info->bus_info, "batman", sizeof(info->bus_info)); } /* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 * Declare each description string in struct.name[] to get fixed sized buffer * and compile time checking for strings longer than ETH_GSTRING_LEN. */ static const struct { const char name[ETH_GSTRING_LEN]; } batadv_counters_strings[] = { { "tx" }, { "tx_bytes" }, { "tx_dropped" }, { "rx" }, { "rx_bytes" }, { "forward" }, { "forward_bytes" }, { "mgmt_tx" }, { "mgmt_tx_bytes" }, { "mgmt_rx" }, { "mgmt_rx_bytes" }, { "frag_tx" }, { "frag_tx_bytes" }, { "frag_rx" }, { "frag_rx_bytes" }, { "frag_fwd" }, { "frag_fwd_bytes" }, { "tt_request_tx" }, { "tt_request_rx" }, { "tt_response_tx" }, { "tt_response_rx" }, { "tt_roam_adv_tx" }, { "tt_roam_adv_rx" }, #ifdef CONFIG_BATMAN_ADV_DAT { "dat_get_tx" }, { "dat_get_rx" }, { "dat_put_tx" }, { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif #ifdef CONFIG_BATMAN_ADV_NC { "nc_code" }, { "nc_code_bytes" }, { "nc_recode" }, { "nc_recode_bytes" }, { "nc_buffer" }, { "nc_decode" }, { "nc_decode_bytes" }, { "nc_decode_failed" }, { "nc_sniffed" }, #endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) { if (stringset == ETH_SS_STATS) memcpy(data, batadv_counters_strings, sizeof(batadv_counters_strings)); } static void batadv_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct batadv_priv *bat_priv = netdev_priv(dev); int i; for (i = 0; i < BATADV_CNT_NUM; i++) data[i] = batadv_sum_counter(bat_priv, i); } static int batadv_get_sset_count(struct net_device *dev, int stringset) { if (stringset == ETH_SS_STATS) return BATADV_CNT_NUM; return -EOPNOTSUPP; } static const struct ethtool_ops batadv_ethtool_ops = { .get_drvinfo = batadv_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = batadv_get_strings, .get_ethtool_stats = batadv_get_ethtool_stats, .get_sset_count = batadv_get_sset_count, }; /** * batadv_softif_free() - Deconstructor of batadv_soft_interface * @dev: Device to cleanup and remove */ static void batadv_softif_free(struct net_device *dev) { batadv_mesh_free(dev); /* some scheduled RCU callbacks need the bat_priv struct to accomplish * their tasks. Wait for them all to be finished before freeing the * netdev and its private data (bat_priv) */ rcu_barrier(); } /** * batadv_softif_init_early() - early stage initialization of soft interface * @dev: registered network device to modify */ static void batadv_softif_init_early(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; /* generate random address */ eth_hw_addr_random(dev); dev->ethtool_ops = &batadv_ethtool_ops; } /** * batadv_softif_validate() - validate configuration of new batadv link * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_algo_ops *algo_ops; if (!data) return 0; if (data[IFLA_BATADV_ALGO_NAME]) { algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME])); if (!algo_ops) return -EINVAL; } return 0; } /** * batadv_softif_newlink() - pre-initialize and register new batadv link * @src_net: the applicable net namespace * @dev: network device to register * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); const char *algo_name; int err; if (data && data[IFLA_BATADV_ALGO_NAME]) { algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]); err = batadv_algo_select(bat_priv, algo_name); if (err) return -EINVAL; } return register_netdevice(dev); } /** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface * @head: list pointer */ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, struct list_head *head) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *hard_iface; struct batadv_softif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); batadv_softif_vlan_put(vlan); } unregister_netdevice_queue(soft_iface, head); } /** * batadv_softif_is_valid() - Check whether device is a batadv soft interface * @net_dev: device which should be checked * * Return: true when net_dev is a batman-adv interface, false otherwise */ bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) return true; return false; } static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = { [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING }, }; struct rtnl_link_ops batadv_link_ops __read_mostly = { .kind = "batadv", .priv_size = sizeof(struct batadv_priv), .setup = batadv_softif_init_early, .maxtype = IFLA_BATADV_MAX, .policy = batadv_ifla_policy, .validate = batadv_softif_validate, .newlink = batadv_softif_newlink, .dellink = batadv_softif_destroy_netlink, }; |
7 2 5 5 1 4 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 | /* * net/tipc/net.c: TIPC network routing code * * Copyright (c) 1995-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "net.h" #include "name_distr.h" #include "subscr.h" #include "socket.h" #include "node.h" #include "bcast.h" #include "link.h" #include "netlink.h" #include "monitor.h" /* * The TIPC locking policy is designed to ensure a very fine locking * granularity, permitting complete parallel access to individual * port and node/link instances. The code consists of four major * locking domains, each protected with their own disjunct set of locks. * * 1: The bearer level. * RTNL lock is used to serialize the process of configuring bearer * on update side, and RCU lock is applied on read side to make * bearer instance valid on both paths of message transmission and * reception. * * 2: The node and link level. * All node instances are saved into two tipc_node_list and node_htable * lists. The two lists are protected by node_list_lock on write side, * and they are guarded with RCU lock on read side. Especially node * instance is destroyed only when TIPC module is removed, and we can * confirm that there has no any user who is accessing the node at the * moment. Therefore, Except for iterating the two lists within RCU * protection, it's no needed to hold RCU that we access node instance * in other places. * * In addition, all members in node structure including link instances * are protected by node spin lock. * * 3: The transport level of the protocol. * This consists of the structures port, (and its user level * representations, such as user_port and tipc_sock), reference and * tipc_user (port.c, reg.c, socket.c). * * This layer has four different locks: * - The tipc_port spin_lock. This is protecting each port instance * from parallel data access and removal. Since we can not place * this lock in the port itself, it has been placed in the * corresponding reference table entry, which has the same life * cycle as the module. This entry is difficult to access from * outside the TIPC core, however, so a pointer to the lock has * been added in the port instance, -to be used for unlocking * only. * - A read/write lock to protect the reference table itself (teg.c). * (Nobody is using read-only access to this, so it can just as * well be changed to a spin_lock) * - A spin lock to protect the registry of kernel/driver users (reg.c) * - A global spin_lock (tipc_port_lock), which only task is to ensure * consistency where more than one port is involved in an operation, * i.e., when a port is part of a linked list of ports. * There are two such lists; 'port_list', which is used for management, * and 'wait_list', which is used to queue ports during congestion. * * 4: The name table (name_table.c, name_distr.c, subscription.c) * - There is one big read/write-lock (tipc_nametbl_lock) protecting the * overall name table structure. Nothing must be added/removed to * this structure without holding write access to it. * - There is one local spin_lock per sub_sequence, which can be seen * as a sub-domain to the tipc_nametbl_lock domain. It is used only * for translation operations, and is needed because a translation * steps the root of the 'publication' linked list between each lookup. * This is always used within the scope of a tipc_nametbl_lock(read). * - A local spin_lock protecting the queue of subscriber events. */ static void tipc_net_finalize(struct net *net, u32 addr); int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { if (tipc_own_id(net)) { pr_info("Cannot configure node identity twice\n"); return -1; } pr_info("Started in network mode\n"); if (node_id) tipc_set_node_id(net, node_id); if (addr) tipc_net_finalize(net, addr); return 0; } static void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_socket_addr sk = {0, addr}; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, TIPC_NODE_STATE, addr, addr); if (cmpxchg(&tn->node_addr, 0, addr)) return; tipc_set_node_addr(net, addr); tipc_named_reinit(net); tipc_sk_reinit(net); tipc_mon_reinit_self(net); tipc_nametbl_publish(net, &ua, &sk, addr); } void tipc_net_finalize_work(struct work_struct *work) { struct tipc_net *tn = container_of(work, struct tipc_net, work); tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr); } void tipc_net_stop(struct net *net) { if (!tipc_own_id(net)) return; rtnl_lock(); tipc_bearer_stop(net); tipc_node_stop(net); rtnl_unlock(); pr_info("Left network mode\n"); } static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = net_generic(net, tipc_net_id); u64 *w0 = (u64 *)&tn->node_id[0]; u64 *w1 = (u64 *)&tn->node_id[8]; struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NET_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int err; int done = cb->args[0]; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; err = __tipc_nl_add_net(net, &msg); if (err) goto out; done = 1; out: cb->args[0] = done; return skb->len; } int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); int err; if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* Can't change net id once TIPC has joined a network */ if (tipc_own_addr(net)) return -EPERM; if (attrs[TIPC_NLA_NET_ID]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_NET_ID]); if (val < 1 || val > 9999) return -EINVAL; tn->net_id = val; } if (attrs[TIPC_NLA_NET_ADDR]) { u32 addr; addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; tn->legacy_addr_format = true; tipc_net_init(net, NULL, addr); } if (attrs[TIPC_NLA_NET_NODEID]) { u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); tipc_net_init(net, node_id, 0); } return 0; } int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_net_set(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = tipc_net(net); struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_ADDR_LEGACY_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; if (tn->legacy_addr_format) if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; struct sk_buff *rep; int err; rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!rep) return -ENOMEM; msg.skb = rep; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_addr_legacy_get(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); } |
50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 | // SPDX-License-Identifier: GPL-2.0+ /* * linux/net/sunrpc/gss_rpc_upcall.c * * Copyright (C) 2012 Simo Sorce <simo@redhat.com> */ #include <linux/types.h> #include <linux/un.h> #include <linux/sunrpc/svcauth.h> #include "gss_rpc_upcall.h" #define GSSPROXY_SOCK_PATHNAME "/var/run/gssproxy.sock" #define GSSPROXY_PROGRAM (400112u) #define GSSPROXY_VERS_1 (1u) /* * Encoding/Decoding functions */ enum { GSSX_NULL = 0, /* Unused */ GSSX_INDICATE_MECHS = 1, GSSX_GET_CALL_CONTEXT = 2, GSSX_IMPORT_AND_CANON_NAME = 3, GSSX_EXPORT_CRED = 4, GSSX_IMPORT_CRED = 5, GSSX_ACQUIRE_CRED = 6, GSSX_STORE_CRED = 7, GSSX_INIT_SEC_CONTEXT = 8, GSSX_ACCEPT_SEC_CONTEXT = 9, GSSX_RELEASE_HANDLE = 10, GSSX_GET_MIC = 11, GSSX_VERIFY = 12, GSSX_WRAP = 13, GSSX_UNWRAP = 14, GSSX_WRAP_SIZE_LIMIT = 15, }; #define PROC(proc, name) \ [GSSX_##proc] = { \ .p_proc = GSSX_##proc, \ .p_encode = gssx_enc_##name, \ .p_decode = gssx_dec_##name, \ .p_arglen = GSSX_ARG_##name##_sz, \ .p_replen = GSSX_RES_##name##_sz, \ .p_statidx = GSSX_##proc, \ .p_name = #proc, \ } static const struct rpc_procinfo gssp_procedures[] = { PROC(INDICATE_MECHS, indicate_mechs), PROC(GET_CALL_CONTEXT, get_call_context), PROC(IMPORT_AND_CANON_NAME, import_and_canon_name), PROC(EXPORT_CRED, export_cred), PROC(IMPORT_CRED, import_cred), PROC(ACQUIRE_CRED, acquire_cred), PROC(STORE_CRED, store_cred), PROC(INIT_SEC_CONTEXT, init_sec_context), PROC(ACCEPT_SEC_CONTEXT, accept_sec_context), PROC(RELEASE_HANDLE, release_handle), PROC(GET_MIC, get_mic), PROC(VERIFY, verify), PROC(WRAP, wrap), PROC(UNWRAP, unwrap), PROC(WRAP_SIZE_LIMIT, wrap_size_limit), }; /* * Common transport functions */ static const struct rpc_program gssp_program; static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) { static const struct sockaddr_un gssp_localaddr = { .sun_family = AF_LOCAL, .sun_path = GSSPROXY_SOCK_PATHNAME, }; struct rpc_create_args args = { .net = net, .protocol = XPRT_TRANSPORT_LOCAL, .address = (struct sockaddr *)&gssp_localaddr, .addrsize = sizeof(gssp_localaddr), .servername = "localhost", .program = &gssp_program, .version = GSSPROXY_VERS_1, .authflavor = RPC_AUTH_NULL, /* * Note we want connection to be done in the caller's * filesystem namespace. We therefore turn off the idle * timeout, which would result in reconnections being * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; int result = 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("RPC: failed to create AF_LOCAL gssproxy " "client (errno %ld).\n", PTR_ERR(clnt)); result = PTR_ERR(clnt); *_clnt = NULL; goto out; } dprintk("RPC: created new gssp local client (gssp_local_clnt: " "%p)\n", clnt); *_clnt = clnt; out: return result; } void init_gssp_clnt(struct sunrpc_net *sn) { mutex_init(&sn->gssp_lock); sn->gssp_clnt = NULL; } int set_gssp_clnt(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; int ret; mutex_lock(&sn->gssp_lock); ret = gssp_rpc_create(net, &clnt); if (!ret) { if (sn->gssp_clnt) rpc_shutdown_client(sn->gssp_clnt); sn->gssp_clnt = clnt; } mutex_unlock(&sn->gssp_lock); return ret; } void clear_gssp_clnt(struct sunrpc_net *sn) { mutex_lock(&sn->gssp_lock); if (sn->gssp_clnt) { rpc_shutdown_client(sn->gssp_clnt); sn->gssp_clnt = NULL; } mutex_unlock(&sn->gssp_lock); } static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) { struct rpc_clnt *clnt; mutex_lock(&sn->gssp_lock); clnt = sn->gssp_clnt; if (clnt) refcount_inc(&clnt->cl_count); mutex_unlock(&sn->gssp_lock); return clnt; } static int gssp_call(struct net *net, struct rpc_message *msg) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; int status; clnt = get_gssp_clnt(sn); if (!clnt) return -EIO; status = rpc_call_sync(clnt, msg, 0); if (status < 0) { dprintk("gssp: rpc_call returned error %d\n", -status); switch (status) { case -EPROTONOSUPPORT: status = -EINVAL; break; case -ECONNREFUSED: case -ETIMEDOUT: case -ENOTCONN: status = -EAGAIN; break; case -ERESTARTSYS: if (signalled ()) status = -EINTR; break; default: break; } } rpc_release_client(clnt); return status; } static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) { unsigned int i; for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); kfree(arg->pages); } static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { unsigned int i; arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); if (!arg->pages) return -ENOMEM; for (i = 0; i < arg->npages; i++) { arg->pages[i] = alloc_page(GFP_KERNEL); if (!arg->pages[i]) { gssp_free_receive_pages(arg); return -ENOMEM; } } return 0; } static char *gssp_stringify(struct xdr_netobj *netobj) { return kmemdup_nul(netobj->data, netobj->len, GFP_KERNEL); } static void gssp_hostbased_service(char **principal) { char *c; if (!*principal) return; /* terminate and remove realm part */ c = strchr(*principal, '@'); if (c) { *c = '\0'; /* change service-hostname delimiter */ c = strchr(*principal, '/'); if (c) *c = '@'; } if (!c) { /* not a service principal */ kfree(*principal); *principal = NULL; } } /* * Public functions */ /* numbers somewhat arbitrary but large enough for current needs */ #define GSSX_MAX_OUT_HANDLE 128 #define GSSX_MAX_SRC_PRINC 256 #define GSSX_KMEMBUF (GSSX_max_output_handle_sz + \ GSSX_max_oid_sz + \ GSSX_max_princ_sz + \ sizeof(struct svc_cred)) int gssp_accept_sec_context_upcall(struct net *net, struct gssp_upcall_data *data) { struct gssx_ctx ctxh = { .state = data->in_handle }; struct gssx_arg_accept_sec_context arg = { .input_token = data->in_token, }; struct gssx_ctx rctxh = { /* * pass in the max length we expect for each of these * buffers but let the xdr code kmalloc them: */ .exported_context_token.len = GSSX_max_output_handle_sz, .mech.len = GSS_OID_MAX_LEN, .targ_name.display_name.len = GSSX_max_princ_sz, .src_name.display_name.len = GSSX_max_princ_sz }; struct gssx_res_accept_sec_context res = { .context_handle = &rctxh, .output_token = &data->out_token }; struct rpc_message msg = { .rpc_proc = &gssp_procedures[GSSX_ACCEPT_SEC_CONTEXT], .rpc_argp = &arg, .rpc_resp = &res, .rpc_cred = NULL, /* FIXME ? */ }; struct xdr_netobj client_name = { 0 , NULL }; struct xdr_netobj target_name = { 0, NULL }; int ret; if (data->in_handle.len != 0) arg.context_handle = &ctxh; res.output_token->len = GSSX_max_output_token_sz; ret = gssp_alloc_receive_pages(&arg); if (ret) return ret; ret = gssp_call(net, &msg); gssp_free_receive_pages(&arg); /* we need to fetch all data even in case of error so * that we can free special strctures is they have been allocated */ data->major_status = res.status.major_status; data->minor_status = res.status.minor_status; if (res.context_handle) { data->out_handle = rctxh.exported_context_token; data->mech_oid.len = rctxh.mech.len; if (rctxh.mech.data) { memcpy(data->mech_oid.data, rctxh.mech.data, data->mech_oid.len); kfree(rctxh.mech.data); } client_name = rctxh.src_name.display_name; target_name = rctxh.targ_name.display_name; } if (res.options.count == 1) { gssx_buffer *value = &res.options.data[0].value; /* Currently we only decode CREDS_VALUE, if we add * anything else we'll have to loop and match on the * option name */ if (value->len == 1) { /* steal group info from struct svc_cred */ data->creds = *(struct svc_cred *)value->data; data->found_creds = 1; } /* whether we use it or not, free data */ kfree(value->data); } if (res.options.count != 0) { kfree(res.options.data); } /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */ if (data->found_creds) { if (client_name.data) { data->creds.cr_raw_principal = gssp_stringify(&client_name); data->creds.cr_principal = gssp_stringify(&client_name); gssp_hostbased_service(&data->creds.cr_principal); } if (target_name.data) { data->creds.cr_targ_princ = gssp_stringify(&target_name); gssp_hostbased_service(&data->creds.cr_targ_princ); } } kfree(client_name.data); kfree(target_name.data); return ret; } void gssp_free_upcall_data(struct gssp_upcall_data *data) { kfree(data->in_handle.data); kfree(data->out_handle.data); kfree(data->out_token.data); free_svc_cred(&data->creds); } /* * Initialization stuff */ static unsigned int gssp_version1_counts[ARRAY_SIZE(gssp_procedures)]; static const struct rpc_version gssp_version1 = { .number = GSSPROXY_VERS_1, .nrprocs = ARRAY_SIZE(gssp_procedures), .procs = gssp_procedures, .counts = gssp_version1_counts, }; static const struct rpc_version *gssp_version[] = { NULL, &gssp_version1, }; static struct rpc_stat gssp_stats; static const struct rpc_program gssp_program = { .name = "gssproxy", .number = GSSPROXY_PROGRAM, .nrvers = ARRAY_SIZE(gssp_version), .version = gssp_version, .stats = &gssp_stats, }; |
67 20 20 529 355 19 158 76 1 72 344 9 2 28 31 31 16 27 19 50 354 28 103 28 2 16 108 3 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_NET_H #define _LINUX_VIRTIO_NET_H #include <linux/if_vlan.h> #include <linux/udp.h> #include <uapi/linux/tcp.h> #include <uapi/linux/virtio_net.h> static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) { switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: return protocol == cpu_to_be16(ETH_P_IP); case VIRTIO_NET_HDR_GSO_TCPV6: return protocol == cpu_to_be16(ETH_P_IPV6); case VIRTIO_NET_HDR_GSO_UDP: return protocol == cpu_to_be16(ETH_P_IP) || protocol == cpu_to_be16(ETH_P_IPV6); default: return false; } } static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { if (skb->protocol) return 0; switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: skb->protocol = cpu_to_be16(ETH_P_IP); break; case VIRTIO_NET_HDR_GSO_TCPV6: skb->protocol = cpu_to_be16(ETH_P_IPV6); break; default: return -EINVAL; } return 0; } static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { unsigned int gso_type = 0; unsigned int thlen = 0; unsigned int p_off = 0; unsigned int ip_proto; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: gso_type = SKB_GSO_TCPV4; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); break; case VIRTIO_NET_HDR_GSO_TCPV6: gso_type = SKB_GSO_TCPV6; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); break; case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; default: return -EINVAL; } if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) gso_type |= SKB_GSO_TCP_ECN; if (hdr->gso_size == 0) return -EINVAL; } skb_reset_mac_header(skb); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start); u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); if (!pskb_may_pull(skb, needed)) return -EINVAL; if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; p_off = skb_transport_offset(skb) + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. * probe and drop if does not match one of the above types. */ if (gso_type && skb->network_header) { struct flow_keys_basic keys; if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); if (!protocol) virtio_net_hdr_set_proto(skb, hdr); else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type)) return -EINVAL; else skb->protocol = protocol; } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) { /* UFO does not specify ipv4 or 6: try both */ if (gso_type & SKB_GSO_UDP && skb->protocol == htons(ETH_P_IP)) { skb->protocol = htons(ETH_P_IPV6); goto retry; } return -EINVAL; } p_off = keys.control.thoff + thlen; if (!pskb_may_pull(skb, p_off) || keys.basic.ip_proto != ip_proto) return -EINVAL; skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { p_off = thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); switch (gso_type & ~SKB_GSO_TCP_ECN) { case SKB_GSO_UDP: /* UFO may not include transport header in gso_size. */ nh_off -= thlen; break; case SKB_GSO_UDP_L4: if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return -EINVAL; if (skb->csum_offset != offsetof(struct udphdr, check)) return -EINVAL; if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS) return -EINVAL; if (gso_type != SKB_GSO_UDP_L4) return -EINVAL; break; case SKB_GSO_TCPV4: case SKB_GSO_TCPV6: if (skb->ip_summed == CHECKSUM_PARTIAL && skb->csum_offset != offsetof(struct tcphdr, check)) return -EINVAL; break; } /* Kernel has a special handling for GSO_BY_FRAGS. */ if (gso_size == GSO_BY_FRAGS) return -EINVAL; /* Too small packets are not really GSO ones. */ if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } } return 0; } static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, bool has_data_valid, int vlan_hlen) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ if (skb_is_gso(skb)) { struct skb_shared_info *sinfo = skb_shinfo(skb); /* This is a hint as to how much should be linear. */ hdr->hdr_len = __cpu_to_virtio16(little_endian, skb_headlen(skb)); hdr->gso_size = __cpu_to_virtio16(little_endian, sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = __cpu_to_virtio16(little_endian, skb_checksum_start_offset(skb) + vlan_hlen); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); } else if (has_data_valid && skb->ip_summed == CHECKSUM_UNNECESSARY) { hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; } #endif /* _LINUX_VIRTIO_NET_H */ |
50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/lockd/svc.c * * This is the central lockd service. * * FIXME: Separate the lockd NFS server functionality from the lockd NFS * client functionality. Oh why didn't Sun create two separate * services in the first place? * * Authors: Olaf Kirch (okir@monad.swb.de) * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/moduleparam.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/uio.h> #include <linux/smp.h> #include <linux/mutex.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/inetdevice.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <net/ip.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <linux/lockd/lockd.h> #include <linux/nfs.h> #include "netns.h" #include "procfs.h" #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) #define ALLOWED_SIGS (sigmask(SIGKILL)) static struct svc_program nlmsvc_program; const struct nlmsvc_binding *nlmsvc_ops; EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; unsigned long nlmsvc_timeout; unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 */ static unsigned long nlm_grace_period; static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; /* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ static unsigned int nlm_max_connections = 1024; /* * Constants needed for the sysctl interface. */ static const unsigned long nlm_grace_period_min = 0; static const unsigned long nlm_grace_period_max = 240; static const unsigned long nlm_timeout_min = 3; static const unsigned long nlm_timeout_max = 20; static const int nlm_port_min = 0, nlm_port_max = 65535; #ifdef CONFIG_SYSCTL static struct ctl_table_header * nlm_sysctl_table; #endif static unsigned long get_lockd_grace_period(void) { /* Note: nlm_timeout should always be nonzero */ if (nlm_grace_period) return roundup(nlm_grace_period, nlm_timeout) * HZ; else return nlm_timeout * 5 * HZ; } static void grace_ender(struct work_struct *grace) { struct delayed_work *dwork = to_delayed_work(grace); struct lockd_net *ln = container_of(dwork, struct lockd_net, grace_period_end); locks_end_grace(&ln->lockd_manager); } static void set_grace_period(struct net *net) { unsigned long grace_period = get_lockd_grace_period(); struct lockd_net *ln = net_generic(net, lockd_net_id); locks_start_grace(net, &ln->lockd_manager); cancel_delayed_work_sync(&ln->grace_period_end); schedule_delayed_work(&ln->grace_period_end, grace_period); } static void restart_grace(void) { if (nlmsvc_ops) { struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); nlmsvc_invalidate_all(); set_grace_period(net); } } /* * This is the lockd kernel thread */ static int lockd(void *vrqstp) { int err = 0; struct svc_rqst *rqstp = vrqstp; struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); /* try_to_freeze() is called from svc_recv() */ set_freezable(); /* Allow SIGKILL to tell lockd to drop all of its locks */ allow_signal(SIGKILL); dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); /* * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. */ while (!kthread_should_stop()) { long timeout = MAX_SCHEDULE_TIMEOUT; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; if (signalled()) { flush_signals(current); restart_grace(); continue; } timeout = nlmsvc_retry_blocked(); /* * Find a socket with data available and call its * recvfrom routine. */ err = svc_recv(rqstp, timeout); if (err == -EAGAIN || err == -EINTR) continue; dprintk("lockd: request from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); svc_process(rqstp); } flush_signals(current); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); dprintk("lockd_down: service stopped\n"); svc_exit_thread(rqstp); return 0; } static int create_lockd_listener(struct svc_serv *serv, const char *name, struct net *net, const int family, const unsigned short port, const struct cred *cred) { struct svc_xprt *xprt; xprt = svc_find_xprt(serv, name, net, family, 0); if (xprt == NULL) return svc_xprt_create(serv, name, net, family, port, SVC_SOCK_DEFAULTS, cred); svc_xprt_put(xprt); return 0; } static int create_lockd_family(struct svc_serv *serv, struct net *net, const int family, const struct cred *cred) { int err; err = create_lockd_listener(serv, "udp", net, family, nlm_udpport, cred); if (err < 0) return err; return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport, cred); } /* * Ensure there are active UDP and TCP listeners for lockd. * * Even if we have only TCP NFS mounts and/or TCP NFSDs, some * local services (such as rpc.statd) still require UDP, and * some NFS servers do not yet support NLM over TCP. * * Returns zero if all listeners are available; otherwise a * negative errno value is returned. */ static int make_socks(struct svc_serv *serv, struct net *net, const struct cred *cred) { static int warned; int err; err = create_lockd_family(serv, net, PF_INET, cred); if (err < 0) goto out_err; err = create_lockd_family(serv, net, PF_INET6, cred); if (err < 0 && err != -EAFNOSUPPORT) goto out_err; warned = 0; return 0; out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); svc_xprt_destroy_all(serv, net); svc_rpcb_cleanup(serv, net); return err; } static int lockd_up_net(struct svc_serv *serv, struct net *net, const struct cred *cred) { struct lockd_net *ln = net_generic(net, lockd_net_id); int error; if (ln->nlmsvc_users++) return 0; error = svc_bind(serv, net); if (error) goto err_bind; error = make_socks(serv, net, cred); if (error < 0) goto err_bind; set_grace_period(net); dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum); return 0; err_bind: ln->nlmsvc_users--; return error; } static void lockd_down_net(struct svc_serv *serv, struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); svc_xprt_destroy_all(serv, net); svc_rpcb_cleanup(serv, net); } } else { pr_err("%s: no users! net=%x\n", __func__, net->ns.inum); BUG(); } } static int lockd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { dprintk("lockd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin); } out: return NOTIFY_DONE; } static struct notifier_block lockd_inetaddr_notifier = { .notifier_call = lockd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int lockd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6); } out: return NOTIFY_DONE; } static struct notifier_block lockd_inet6addr_notifier = { .notifier_call = lockd_inet6addr_event, }; #endif static int lockd_get(void) { struct svc_serv *serv; int error; if (nlmsvc_serv) { nlmsvc_users++; return 0; } /* * Sanity check: if there's no pid, * we should be the first user ... */ if (nlmsvc_users) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); if (!nlm_timeout) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return -ENOMEM; } serv->sv_maxconn = nlm_max_connections; error = svc_set_num_threads(serv, NULL, 1); /* The thread now holds the only reference */ svc_put(serv); if (error < 0) return error; nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&lockd_inet6addr_notifier); #endif dprintk("lockd_up: service created\n"); nlmsvc_users++; return 0; } static void lockd_put(void) { if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n")) return; if (--nlmsvc_users) return; unregister_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif svc_set_num_threads(nlmsvc_serv, NULL, 0); nlmsvc_serv = NULL; dprintk("lockd_down: service destroyed\n"); } /* * Bring up the lockd process if it's not already up. */ int lockd_up(struct net *net, const struct cred *cred) { int error; mutex_lock(&nlmsvc_mutex); error = lockd_get(); if (error) goto err; error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { lockd_put(); goto err; } err: mutex_unlock(&nlmsvc_mutex); return error; } EXPORT_SYMBOL_GPL(lockd_up); /* * Decrement the user count and bring down lockd if we're the last. */ void lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); lockd_down_net(nlmsvc_serv, net); lockd_put(); mutex_unlock(&nlmsvc_mutex); } EXPORT_SYMBOL_GPL(lockd_down); #ifdef CONFIG_SYSCTL /* * Sysctl parameters (same as module parameters, different interface). */ static struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_grace_period_min, .extra2 = (unsigned long *) &nlm_grace_period_max, }, { .procname = "nlm_timeout", .data = &nlm_timeout, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_timeout_min, .extra2 = (unsigned long *) &nlm_timeout_max, }, { .procname = "nlm_udpport", .data = &nlm_udpport, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, { .procname = "nlm_tcpport", .data = &nlm_tcpport, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, { .procname = "nsm_use_hostnames", .data = &nsm_use_hostnames, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dobool, }, { .procname = "nsm_local_state", .data = &nsm_local_state, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; static struct ctl_table nlm_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, .child = nlm_sysctls, }, { } }; static struct ctl_table nlm_sysctl_root[] = { { .procname = "fs", .mode = 0555, .child = nlm_sysctl_dir, }, { } }; #endif /* CONFIG_SYSCTL */ /* * Module (and sysfs) parameters. */ #define param_set_min_max(name, type, which_strtol, min, max) \ static int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ char *endp; \ __typeof__(type) num = which_strtol(val, &endp, 0); \ if (endp == val || *endp || num < (min) || num > (max)) \ return -EINVAL; \ *((type *) kp->arg) = num; \ return 0; \ } static inline int is_callback(u32 proc) { return proc == NLMPROC_GRANTED || proc == NLMPROC_GRANTED_MSG || proc == NLMPROC_TEST_RES || proc == NLMPROC_LOCK_RES || proc == NLMPROC_CANCEL_RES || proc == NLMPROC_UNLOCK_RES || proc == NLMPROC_NSM_NOTIFY; } static int lockd_authenticate(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: case RPC_AUTH_UNIX: rqstp->rq_auth_stat = rpc_auth_ok; if (rqstp->rq_proc == 0) return SVC_OK; if (is_callback(rqstp->rq_proc)) { /* Leave it to individual procedures to * call nlmsvc_lookup_host(rqstp) */ return SVC_OK; } return svc_set_client(rqstp); } rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } param_set_min_max(port, int, simple_strtol, 0, 65535) param_set_min_max(grace_period, unsigned long, simple_strtoul, nlm_grace_period_min, nlm_grace_period_max) param_set_min_max(timeout, unsigned long, simple_strtoul, nlm_timeout_min, nlm_timeout_max) MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("NFS file locking service version " LOCKD_VERSION "."); MODULE_LICENSE("GPL"); module_param_call(nlm_grace_period, param_set_grace_period, param_get_ulong, &nlm_grace_period, 0644); module_param_call(nlm_timeout, param_set_timeout, param_get_ulong, &nlm_timeout, 0644); module_param_call(nlm_udpport, param_set_port, param_get_int, &nlm_udpport, 0644); module_param_call(nlm_tcpport, param_set_port, param_get_int, &nlm_tcpport, 0644); module_param(nsm_use_hostnames, bool, 0644); module_param(nlm_max_connections, uint, 0644); static int lockd_init_net(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->lockd_manager.list); ln->lockd_manager.block_opens = false; INIT_LIST_HEAD(&ln->nsm_handles); return 0; } static void lockd_exit_net(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); WARN_ONCE(!list_empty(&ln->lockd_manager.list), "net %x %s: lockd_manager.list is not empty\n", net->ns.inum, __func__); WARN_ONCE(!list_empty(&ln->nsm_handles), "net %x %s: nsm_handles list is not empty\n", net->ns.inum, __func__); WARN_ONCE(delayed_work_pending(&ln->grace_period_end), "net %x %s: grace_period_end was not cancelled\n", net->ns.inum, __func__); } static struct pernet_operations lockd_net_ops = { .init = lockd_init_net, .exit = lockd_exit_net, .id = &lockd_net_id, .size = sizeof(struct lockd_net), }; /* * Initialising and terminating the module. */ static int __init init_nlm(void) { int err; #ifdef CONFIG_SYSCTL err = -ENOMEM; nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); if (nlm_sysctl_table == NULL) goto err_sysctl; #endif err = register_pernet_subsys(&lockd_net_ops); if (err) goto err_pernet; err = lockd_create_procfs(); if (err) goto err_procfs; return 0; err_procfs: unregister_pernet_subsys(&lockd_net_ops); err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); err_sysctl: #endif return err; } static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); lockd_remove_procfs(); unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); #endif } module_init(init_nlm); module_exit(exit_nlm); /** * nlmsvc_dispatch - Process an NLM Request * @rqstp: incoming request * @statp: pointer to location of accept_stat field in RPC Reply buffer * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp) { const struct svc_procedure *procp = rqstp->rq_procinfo; svcxdr_init_decode(rqstp); if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; *statp = procp->pc_func(rqstp); if (*statp == rpc_drop_reply) return 0; if (*statp != rpc_success) return 1; svcxdr_init_encode(rqstp); if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; return 1; out_decode_err: *statp = rpc_garbage_args; return 1; out_encode_err: *statp = rpc_system_err; return 1; } /* * Define NLM program and procedures */ static unsigned int nlmsvc_version1_count[17]; static const struct svc_version nlmsvc_version1 = { .vs_vers = 1, .vs_nproc = 17, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version1_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; static unsigned int nlmsvc_version3_count[24]; static const struct svc_version nlmsvc_version3 = { .vs_vers = 3, .vs_nproc = 24, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version3_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #ifdef CONFIG_LOCKD_V4 static unsigned int nlmsvc_version4_count[24]; static const struct svc_version nlmsvc_version4 = { .vs_vers = 4, .vs_nproc = 24, .vs_proc = nlmsvc_procedures4, .vs_count = nlmsvc_version4_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #endif static const struct svc_version *nlmsvc_version[] = { [1] = &nlmsvc_version1, [3] = &nlmsvc_version3, #ifdef CONFIG_LOCKD_V4 [4] = &nlmsvc_version4, #endif }; #define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) static struct svc_program nlmsvc_program = { .pg_prog = NLM_PROGRAM, /* program number */ .pg_nvers = NLM_NRVERS, /* number of entries in nlmsvc_version */ .pg_vers = nlmsvc_version, /* version table */ .pg_name = "lockd", /* service name */ .pg_class = "nfsd", /* share authentication with nfsd */ .pg_authenticate = &lockd_authenticate, /* export authentication */ .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, }; |
50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * This file implements the various access functions for the * PROC file system. It is mainly used for debugging and * statistics. * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de> * * Fixes: * Alan Cox : UDP sockets show the rxqueue/txqueue * using hint flag for the netinfo. * Pauline Middelink : identd support * Alan Cox : Make /proc safer. * Erik Schoenfelder : /proc/net/snmp * Alan Cox : Handle dead sockets properly. * Gerhard Koerting : Show both timers * Alan Cox : Allow inode to be NULL (kernel socket) * Andi Kleen : Add support for open_requests and * split functions for more readibility. * Andi Kleen : Add support for /proc/net/netstat * Arnaldo C. Melo : Convert to seq_file */ #include <linux/types.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/bottom_half.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #include <net/sock.h> #include <net/raw.h> #define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX) /* * Report socket allocation statistics [mea@utu.fi] */ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; int orphans, sockets; orphans = tcp_orphan_count_sum(); sockets = proto_sockets_allocated_sum_positive(&tcp_prot); socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), proto_memory_allocated(&udp_prot)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", atomic_read(&net->ipv4.fqdir->rhashtable.nelems), frag_mem_limit(net->ipv4.fqdir)); return 0; } /* snmp items */ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS), SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS), SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS), SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS), SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS), SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_SENTINEL }; /* Following items are displayed in /proc/net/netstat */ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS), SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS), SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS), SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS), SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* Non RFC4293 fields */ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), SNMP_MIB_SENTINEL }; static const struct { const char *name; int index; } icmpmibmap[] = { { "DestUnreachs", ICMP_DEST_UNREACH }, { "TimeExcds", ICMP_TIME_EXCEEDED }, { "ParmProbs", ICMP_PARAMETERPROB }, { "SrcQuenchs", ICMP_SOURCE_QUENCH }, { "Redirects", ICMP_REDIRECT }, { "Echos", ICMP_ECHO }, { "EchoReps", ICMP_ECHOREPLY }, { "Timestamps", ICMP_TIMESTAMP }, { "TimestampReps", ICMP_TIMESTAMPREPLY }, { "AddrMasks", ICMP_ADDRESS }, { "AddrMaskReps", ICMP_ADDRESSREPLY }, { NULL, 0 } }; static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN), SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS), SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS), SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS), SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS), SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB), SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS), SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS), SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS), SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED), SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED), SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED), SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS), SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS), SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER), SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO), SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT), SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO), SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE), SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV), SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS), SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ), SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_SENTINEL }; static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, unsigned short *type, int count) { int j; if (count) { seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff); seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", vals[j]); } } static void icmpmsg_put(struct seq_file *seq) { #define PERLINE 16 int i, count; unsigned short type[PERLINE]; unsigned long vals[PERLINE], val; struct net *net = seq->private; count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); if (val) { type[count] = i; vals[count++] = val; } if (count == PERLINE) { icmpmsg_put_line(seq, vals, type, count); count = 0; } } icmpmsg_put_line(seq, vals, type, count); #undef PERLINE } static void icmp_put(struct seq_file *seq) { int i; struct net *net = seq->private; atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_puts(seq, " OutMsgs OutErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } /* * Called from the PROCfs module. This outputs /proc/net/snmp. */ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { struct net *net = seq->private; u64 buff64[IPSTATS_MIB_MAX]; int i; memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %llu", buff64[i]); return 0; } static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) { unsigned long buff[TCPUDP_MIB_MAX]; struct net *net = seq->private; int i; memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); snmp_get_cpu_field_batch(buff, snmp4_tcp_list, net->mib.tcp_statistics); for (i = 0; snmp4_tcp_list[i].name; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); else seq_printf(seq, " %lu", buff[i]); } memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udplite_statistics); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); return 0; } static int snmp_seq_show(struct seq_file *seq, void *v) { snmp_seq_show_ipstats(seq, v); icmp_put(seq); /* RFC 2011 compatibility */ icmpmsg_put(seq); snmp_seq_show_tcp_udp(seq, v); return 0; } /* * Output /proc/net/netstat */ static int netstat_seq_show(struct seq_file *seq, void *v) { const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; struct net *net = seq->private; unsigned long *buff; int i; seq_puts(seq, "TcpExt:"); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), GFP_KERNEL); if (buff) { snmp_get_cpu_field_batch(buff, snmp4_net_list, net->mib.net_statistics); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", buff[i]); } else { for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); } seq_puts(seq, "\nIpExt:"); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); if (buff) { u64 *buff64 = (u64 *)buff; memset(buff64, 0, ip_cnt * sizeof(u64)); snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", buff64[i]); } else { for (i = 0; i < ip_cnt; i++) seq_printf(seq, " %llu", snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp))); } kfree(buff); seq_putc(seq, '\n'); mptcp_seq_show(seq); return 0; } static __net_init int ip_proc_init_net(struct net *net) { if (!proc_create_net_single("sockstat", 0444, net->proc_net, sockstat_seq_show, NULL)) goto out_sockstat; if (!proc_create_net_single("netstat", 0444, net->proc_net, netstat_seq_show, NULL)) goto out_netstat; if (!proc_create_net_single("snmp", 0444, net->proc_net, snmp_seq_show, NULL)) goto out_snmp; return 0; out_snmp: remove_proc_entry("netstat", net->proc_net); out_netstat: remove_proc_entry("sockstat", net->proc_net); out_sockstat: return -ENOMEM; } static __net_exit void ip_proc_exit_net(struct net *net) { remove_proc_entry("snmp", net->proc_net); remove_proc_entry("netstat", net->proc_net); remove_proc_entry("sockstat", net->proc_net); } static __net_initdata struct pernet_operations ip_proc_ops = { .init = ip_proc_init_net, .exit = ip_proc_exit_net, }; int __init ip_misc_proc_init(void) { return register_pernet_subsys(&ip_proc_ops); } |
4 4 69 69 8 8 8 6 2 8 6 2 2 2 1 5 5 5 4 12 3 4 2 1 9 2 7 1 30 8 2 1 8 8 1 2 93 54 40 14 14 1 13 13 13 2 2 2 2 7 7 7 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 | // SPDX-License-Identifier: GPL-2.0-only /* * net/dccp/proto.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/random.h> #include <linux/slab.h> #include <net/checksum.h> #include <net/inet_sock.h> #include <net/inet_common.h> #include <net/sock.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/delay.h> #include <linux/poll.h> #include "ccid.h" #include "dccp.h" #include "feat.h" #define CREATE_TRACE_POINTS #include "trace.h" DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); DEFINE_PER_CPU(unsigned int, dccp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); struct inet_hashinfo dccp_hashinfo; EXPORT_SYMBOL_GPL(dccp_hashinfo); /* the maximum queue length for tx in packets. 0 is no limit */ int sysctl_dccp_tx_qlen __read_mostly = 5; #ifdef CONFIG_IP_DCCP_DEBUG static const char *dccp_state_name(const int state) { static const char *const dccp_state_names[] = { [DCCP_OPEN] = "OPEN", [DCCP_REQUESTING] = "REQUESTING", [DCCP_PARTOPEN] = "PARTOPEN", [DCCP_LISTEN] = "LISTEN", [DCCP_RESPOND] = "RESPOND", [DCCP_CLOSING] = "CLOSING", [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", [DCCP_TIME_WAIT] = "TIME_WAIT", [DCCP_CLOSED] = "CLOSED", }; if (state >= DCCP_MAX_STATES) return "INVALID STATE!"; else return dccp_state_names[state]; } #endif void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk->sk_state; dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, dccp_state_name(oldstate), dccp_state_name(state)); WARN_ON(state == oldstate); switch (state) { case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); /* Client retransmits all Confirm options until entering OPEN */ if (oldstate == DCCP_PARTOPEN) dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || oldstate == DCCP_CLOSING) DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash != NULL && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == DCCP_OPEN) DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_set_state(sk, state); } EXPORT_SYMBOL_GPL(dccp_set_state); static void dccp_finish_passive_close(struct sock *sk) { switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: /* Node (client or server) has received Close packet. */ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); dccp_set_state(sk, DCCP_CLOSED); break; case DCCP_PASSIVE_CLOSEREQ: /* * Client received CloseReq. We set the `active' flag so that * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. */ dccp_send_close(sk, 1); dccp_set_state(sk, DCCP_CLOSING); } } void dccp_done(struct sock *sk) { dccp_set_state(sk, DCCP_CLOSED); dccp_clear_xmit_timers(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(dccp_done); const char *dccp_packet_name(const int type) { static const char *const dccp_packet_names[] = { [DCCP_PKT_REQUEST] = "REQUEST", [DCCP_PKT_RESPONSE] = "RESPONSE", [DCCP_PKT_DATA] = "DATA", [DCCP_PKT_ACK] = "ACK", [DCCP_PKT_DATAACK] = "DATAACK", [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", [DCCP_PKT_CLOSE] = "CLOSE", [DCCP_PKT_RESET] = "RESET", [DCCP_PKT_SYNC] = "SYNC", [DCCP_PKT_SYNCACK] = "SYNCACK", }; if (type >= DCCP_NR_PKT_TYPES) return "INVALID"; else return dccp_packet_names[type]; } EXPORT_SYMBOL_GPL(dccp_packet_name); void dccp_destruct_common(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = NULL; } EXPORT_SYMBOL_GPL(dccp_destruct_common); static void dccp_sk_destruct(struct sock *sk) { dccp_destruct_common(sk); inet_sock_destruct(sk); } int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); INIT_LIST_HEAD(&dp->dccps_featneg); /* control socket doesn't need feat nego */ if (likely(ctl_sock_initialized)) return dccp_feat_init(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } /* Clean up a referenced DCCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(sk); kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); static inline int dccp_listen_start(struct sock *sk, int backlog) { struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; /* do not start to listen if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dp)) return -EPROTO; return inet_csk_listen_start(sk, backlog); } static inline int dccp_need_reset(int state) { return state != DCCP_CLOSED && state != DCCP_LISTEN && state != DCCP_REQUESTING; } int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); const int old_state = sk->sk_state; if (old_state != DCCP_CLOSED) dccp_set_state(sk, DCCP_CLOSED); /* * This corresponds to the ABORT function of RFC793, sec. 3.8 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". */ if (old_state == DCCP_LISTEN) { inet_csk_listen_stop(sk); } else if (dccp_need_reset(old_state)) { dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); sk->sk_err = ECONNRESET; } else if (old_state == DCCP_REQUESTING) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { __kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } inet->inet_dport = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); icsk->icsk_backoff = 0; inet_csk_delack_init(sk); __sk_dst_reset(sk); WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk_error_report(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_disconnect); /* * Wait for a DCCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == DCCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events by poll logic and correct handling of state changes made by another threads is impossible in any case. */ mask = 0; if (READ_ONCE(sk->sk_err)) mask = EPOLLERR; shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected? */ if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { if (atomic_read(&sk->sk_rmem_alloc) > 0) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (sk_stream_is_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. */ if (sk_stream_is_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; } } } return mask; } EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { int rc = -ENOTCONN; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) goto out; switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and * always 0, comparably to UDP. */ rc = put_user(amount, (int __user *)arg); } break; case SIOCINQ: { struct sk_buff *skb; unsigned long amount = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* * We will only return the amount of this packet since * that is all that will be read. */ amount = skb->len; } rc = put_user(amount, (int __user *)arg); } break; default: rc = -ENOIOCTLCMD; break; } out: release_sock(sk); return rc; } EXPORT_SYMBOL_GPL(dccp_ioctl); static int dccp_setsockopt_service(struct sock *sk, const __be32 service, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_service_list *sl = NULL; if (service == DCCP_SERVICE_INVALID_VALUE || optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) return -EINVAL; if (optlen > sizeof(service)) { sl = kmalloc(optlen, GFP_KERNEL); if (sl == NULL) return -ENOMEM; sl->dccpsl_nr = optlen / sizeof(u32) - 1; if (copy_from_sockptr_offset(sl->dccpsl_list, optval, sizeof(service), optlen - sizeof(service)) || dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { kfree(sl); return -EFAULT; } } lock_sock(sk); dp->dccps_service = service; kfree(dp->dccps_service_list); dp->dccps_service_list = sl; release_sock(sk); return 0; } static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) { u8 *list, len; int i, rc; if (cscov < 0 || cscov > 15) return -EINVAL; /* * Populate a list of permissible values, in the range cscov...15. This * is necessary since feature negotiation of single values only works if * both sides incidentally choose the same value. Since the list starts * lowest-value first, negotiation will pick the smallest shared value. */ if (cscov == 0) return 0; len = 16 - cscov; list = kmalloc(len, GFP_KERNEL); if (list == NULL) return -ENOBUFS; for (i = 0; i < len; i++) list[i] = cscov++; rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); if (rc == 0) { if (rx) dccp_sk(sk)->dccps_pcrlen = cscov; else dccp_sk(sk)->dccps_pcslen = cscov; } kfree(list); return rc; } static int dccp_setsockopt_ccid(struct sock *sk, int type, sockptr_t optval, unsigned int optlen) { u8 *val; int rc = 0; if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; val = memdup_sockptr(optval, optlen); if (IS_ERR(val)) return PTR_ERR(val); lock_sock(sk); if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); release_sock(sk); kfree(val); return rc; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CHANGE_L: case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CCID: case DCCP_SOCKOPT_RX_CCID: case DCCP_SOCKOPT_TX_CCID: return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; if (optname == DCCP_SOCKOPT_SERVICE) return dccp_setsockopt_service(sk, val, optval, optlen); lock_sock(sk); switch (optname) { case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; case DCCP_SOCKOPT_SEND_CSCOV: err = dccp_setsockopt_cscov(sk, val, false); break; case DCCP_SOCKOPT_RECV_CSCOV: err = dccp_setsockopt_cscov(sk, val, true); break; case DCCP_SOCKOPT_QPOLICY_ID: if (sk->sk_state != DCCP_CLOSED) err = -EISCONN; else if (val < 0 || val >= DCCPQ_POLICY_MAX) err = -EINVAL; else dp->dccps_qpolicy = val; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: if (val < 0) err = -EINVAL; else dp->dccps_tx_qlen = val; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); return do_dccp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_setsockopt); static int dccp_getsockopt_service(struct sock *sk, int len, __be32 __user *optval, int __user *optlen) { const struct dccp_sock *dp = dccp_sk(sk); const struct dccp_service_list *sl; int err = -ENOENT, slen = 0, total_len = sizeof(u32); lock_sock(sk); if ((sl = dp->dccps_service_list) != NULL) { slen = sl->dccpsl_nr * sizeof(u32); total_len += slen; } err = -EINVAL; if (total_len > len) goto out; err = 0; if (put_user(total_len, optlen) || put_user(dp->dccps_service, optval) || (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) err = -EFAULT; out: release_sock(sk); return err; } static int do_dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dccp_sock *dp; int val, len; if (get_user(len, optlen)) return -EFAULT; if (len < (int)sizeof(int)) return -EINVAL; dp = dccp_sk(sk); switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_SERVICE: return dccp_getsockopt_service(sk, len, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: val = READ_ONCE(dp->dccps_mss_cache); break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); case DCCP_SOCKOPT_TX_CCID: val = ccid_get_current_tx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_RX_CCID: val = ccid_get_current_rx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; case DCCP_SOCKOPT_SEND_CSCOV: val = dp->dccps_pcslen; break; case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; case DCCP_SOCKOPT_QPOLICY_ID: val = dp->dccps_qpolicy; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: val = dp->dccps_tx_qlen; break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); case 192 ... 255: return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, len, (u32 __user *)optval, optlen); default: return -ENOPROTOOPT; } len = sizeof(val); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); return do_dccp_getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_getsockopt); static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) { struct cmsghdr *cmsg; /* * Assign an (opaque) qpolicy priority value to skb->priority. * * We are overloading this skb field for use with the qpolicy subystem. * The skb->priority is normally used for the SO_PRIORITY option, which * is initialised from sk_priority. Since the assignment of sk_priority * to skb->priority happens later (on layer 3), we overload this field * for use with queueing priorities as long as the skb is on layer 4. * The default priority value (if nothing is set) is 0. */ skb->priority = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_DCCP) continue; if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) return -EINVAL; switch (cmsg->cmsg_type) { case DCCP_SCM_PRIORITY: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) return -EINVAL; skb->priority = *(__u32 *)CMSG_DATA(cmsg); break; default: return -EINVAL; } } return 0; } int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { const struct dccp_sock *dp = dccp_sk(sk); const int flags = msg->msg_flags; const int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb; int rc, size; long timeo; trace_dccp_probe(sk, len); if (len > READ_ONCE(dp->dccps_mss_cache)) return -EMSGSIZE; lock_sock(sk); timeo = sock_sndtimeo(sk, noblock); /* * We have to use sk_stream_wait_connect here to set sk_write_pending, * so that the trick in dccp_rcv_request_sent_state_process. */ /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_release; size = sk->sk_prot->max_header + len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (skb == NULL) goto out_release; if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_discard; } if (sk->sk_state == DCCP_CLOSED) { rc = -ENOTCONN; goto out_discard; } /* We need to check dccps_mss_cache after socket is locked. */ if (len > dp->dccps_mss_cache) { rc = -EMSGSIZE; goto out_discard; } skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) goto out_discard; rc = dccp_msghdr_parse(msg, skb); if (rc != 0) goto out_discard; dccp_qpolicy_push(sk, skb); /* * The xmit_timer is set if the TX CCID is rate-based and will expire * when congestion control permits to release further packets into the * network. Window-based CCIDs do not use this timer. */ if (!timer_pending(&dp->dccps_xmit_timer)) dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; out_discard: kfree_skb(skb); goto out_release; } EXPORT_SYMBOL_GPL(dccp_sendmsg); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { const struct dccp_hdr *dh; long timeo; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) { len = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, nonblock); do { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); if (skb == NULL) goto verify_sock_status; dh = dccp_hdr(skb); switch (dh->dccph_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: goto found_ok_skb; case DCCP_PKT_CLOSE: case DCCP_PKT_CLOSEREQ: if (!(flags & MSG_PEEK)) dccp_finish_passive_close(sk); fallthrough; case DCCP_PKT_RESET: dccp_pr_debug("found fin (%s) ok!\n", dccp_packet_name(dh->dccph_type)); len = 0; goto found_fin_ok; default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; break; } if (sk->sk_err) { len = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { len = 0; break; } if (sk->sk_state == DCCP_CLOSED) { if (!sock_flag(sk, SOCK_DONE)) { /* This occurs when user tries to read * from never connected socket. */ len = -ENOTCONN; break; } len = 0; break; } if (!timeo) { len = -EAGAIN; break; } if (signal_pending(current)) { len = sock_intr_errno(timeo); break; } sk_wait_data(sk, &timeo, NULL); continue; found_ok_skb: if (len > skb->len) len = skb->len; else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; } if (flags & MSG_TRUNC) len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; } while (1); out: release_sock(sk); return len; } EXPORT_SYMBOL_GPL(dccp_recvmsg); int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; int err; lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) goto out; old_state = sk->sk_state; if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != DCCP_LISTEN) { /* * FIXME: here it probably should be sk->sk_prot->listen_start * see tcp_listen_start */ err = dccp_listen_start(sk, backlog); if (err) goto out; } err = 0; out: release_sock(sk); return err; } EXPORT_SYMBOL_GPL(inet_dccp_listen); static void dccp_terminate_connection(struct sock *sk) { u8 next_state = DCCP_CLOSED; switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: case DCCP_PASSIVE_CLOSEREQ: dccp_finish_passive_close(sk); break; case DCCP_PARTOPEN: dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); fallthrough; case DCCP_OPEN: dccp_send_close(sk, 1); if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && !dccp_sk(sk)->dccps_server_timewait) next_state = DCCP_ACTIVE_CLOSEREQ; else next_state = DCCP_CLOSING; fallthrough; default: dccp_set_state(sk, next_state); } } void dccp_close(struct sock *sk, long timeout) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; u32 data_was_unread = 0; int state; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == DCCP_LISTEN) { dccp_set_state(sk, DCCP_CLOSED); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } sk_stop_timer(sk, &dp->dccps_xmit_timer); /* * We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the *reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { data_was_unread += skb->len; __kfree_skb(skb); } /* If socket has been already reset kill it. */ if (sk->sk_state == DCCP_CLOSED) goto adjudge_to_death; if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_set_state(sk, DCCP_CLOSED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { /* * Normal connection termination. May need to wait if there are * still packets in the TX queue that are delayed by the CCID. */ dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } /* * Flush write queue. This may be necessary in several cases: * - we have been closed by the peer but still have application data; * - abortive termination (unread data or zero linger time), * - normal termination but queue could not be flushed within time limit */ __skb_queue_purge(&sk->sk_write_queue); sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); /* * It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); /* * Now socket is owned by kernel and we acquire BH lock * to finish close. No need to check for user refs. */ local_bh_disable(); bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); this_cpu_inc(dccp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) goto out; if (sk->sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); } EXPORT_SYMBOL_GPL(dccp_close); void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("called shutdown(%x)\n", how); } EXPORT_SYMBOL_GPL(dccp_shutdown); static inline int __init dccp_mib_init(void) { dccp_statistics = alloc_percpu(struct dccp_mib); if (!dccp_statistics) return -ENOMEM; return 0; } static inline void dccp_mib_exit(void) { free_percpu(dccp_statistics); } static int thash_entries; module_param(thash_entries, int, 0444); MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); #ifdef CONFIG_IP_DCCP_DEBUG bool dccp_debug; module_param(dccp_debug, bool, 0644); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) { unsigned long goal; unsigned long nr_pages = totalram_pages(); int ehash_order, bhash_order, i; int rc; BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) goto out_fail; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; /* * Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ if (nr_pages >= (128 * 1024)) goal = nr_pages >> (21 - PAGE_SHIFT); else goal = nr_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) ; do { unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / sizeof(struct inet_ehash_bucket); while (hash_size & (hash_size - 1)) hash_size--; dccp_hashinfo.ehash_mask = hash_size - 1; dccp_hashinfo.ehash = (struct inet_ehash_bucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); } while (!dccp_hashinfo.ehash && --ehash_order > 0); if (!dccp_hashinfo.ehash) { DCCP_CRIT("Failed to allocate DCCP established hash table"); goto out_free_bind_bucket_cachep; } for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&dccp_hashinfo)) goto out_free_dccp_ehash; bhash_order = ehash_order; do { dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / sizeof(struct inet_bind_hashbucket); if ((dccp_hashinfo.bhash_size > (64 * 1024)) && bhash_order > 0) continue; dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); } while (!dccp_hashinfo.bhash && --bhash_order >= 0); if (!dccp_hashinfo.bhash) { DCCP_CRIT("Failed to allocate DCCP bind hash table"); goto out_free_dccp_locks; } for (i = 0; i < dccp_hashinfo.bhash_size; i++) { spin_lock_init(&dccp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); } rc = dccp_mib_init(); if (rc) goto out_free_dccp_bhash; rc = dccp_ackvec_init(); if (rc) goto out_free_dccp_mib; rc = dccp_sysctl_init(); if (rc) goto out_ackvec_exit; rc = ccid_initialize_builtins(); if (rc) goto out_sysctl_exit; dccp_timestamping_init(); return 0; out_sysctl_exit: dccp_sysctl_exit(); out_ackvec_exit: dccp_ackvec_exit(); out_free_dccp_mib: dccp_mib_exit(); out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); out_free_dccp_locks: inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); out_fail: dccp_hashinfo.bhash = NULL; dccp_hashinfo.ehash = NULL; dccp_hashinfo.bind_bucket_cachep = NULL; return rc; } static void __exit dccp_fini(void) { ccid_cleanup_builtins(); dccp_mib_exit(); free_pages((unsigned long)dccp_hashinfo.bhash, get_order(dccp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket))); free_pages((unsigned long)dccp_hashinfo.ehash, get_order((dccp_hashinfo.ehash_mask + 1) * sizeof(struct inet_ehash_bucket))); inet_ehash_locks_free(&dccp_hashinfo); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); inet_hashinfo2_free_mod(&dccp_hashinfo); } module_init(dccp_init); module_exit(dccp_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>"); MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); |
50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_timeout.h> static const unsigned int nf_ct_generic_timeout = 600*HZ; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { struct nf_generic_net *gn = nf_generic_pernet(net); unsigned int *timeout = data; if (!timeout) timeout = &gn->timeout; if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ; else { /* Set default generic timeout. */ *timeout = gn->timeout; } return 0; } static int generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_generic_init_net(struct net *net) { struct nf_generic_net *gn = nf_generic_pernet(net); gn->timeout = nf_ct_generic_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = { .l4proto = 255, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = generic_timeout_nlattr_to_obj, .obj_to_nlattr = generic_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_GENERIC_MAX, .obj_size = sizeof(unsigned int), .nla_policy = generic_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
58 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ #include <linux/rhashtable-types.h> #include <linux/completion.h> /* Per netns frag queues directory */ struct fqdir { /* sysctls */ long high_thresh; long low_thresh; int timeout; int max_dist; struct inet_frags *f; struct net *net; bool dead; struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ atomic_long_t mem ____cacheline_aligned_in_smp; struct work_struct destroy_work; struct llist_node free_list; }; /** * fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), }; struct frag_v4_compare_key { __be32 saddr; __be32 daddr; u32 user; u32 vif; __be16 id; u16 protocol; }; struct frag_v6_compare_key { struct in6_addr saddr; struct in6_addr daddr; u32 user; __be32 id; u32 iif; }; /** * struct inet_frag_queue - fragment queue * * @node: rhash node * @key: keys identifying this frag. * @timer: queue expiration timer * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { struct rhash_head node; union { struct frag_v4_compare_key v4; struct frag_v6_compare_key v6; } key; struct timer_list timer; spinlock_t lock; refcount_t refcnt; struct rb_root rb_fragments; struct sk_buff *fragments_tail; struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; __u8 flags; u16 max_size; struct fqdir *fqdir; struct rcu_head rcu; }; struct inet_frags { unsigned int qsize; void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; struct rhashtable_params rhash_params; refcount_t refcnt; struct completion completion; }; int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); static inline void fqdir_pre_exit(struct fqdir *fqdir) { /* Prevent creation of new frags. * Pairs with READ_ONCE() in inet_frag_find(). */ WRITE_ONCE(fqdir->high_thresh, 0); /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() * and ip6frag_expire_frag_queue(). */ WRITE_ONCE(fqdir->dead, true); } void fqdir_exit(struct fqdir *fqdir); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); /* Free all skbs in the queue; return the sum of their truesizes. */ unsigned int inet_frag_rbtree_purge(struct rb_root *root); static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) inet_frag_destroy(q); } /* Memory Tracking Functions. */ static inline long frag_mem_limit(const struct fqdir *fqdir) { return atomic_long_read(&fqdir->mem); } static inline void sub_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_sub(val, &fqdir->mem); } static inline void add_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_add(val, &fqdir->mem); } /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. */ #define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ #define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ #define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ #define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ extern const u8 ip_frag_ecn_table[16]; /* Return values of inet_frag_queue_insert() */ #define IPFRAG_OK 0 #define IPFRAG_DUP 1 #define IPFRAG_OVERLAP 2 int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, int offset, int end); void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); #endif |
18 452 43 417 452 125 435 453 50 454 454 452 451 8 35 520 523 521 522 520 523 404 426 426 125 124 453 438 438 404 426 454 454 453 409 451 4 389 452 453 454 410 409 431 432 18 18 18 18 138 139 126 122 139 114 117 502 547 545 81 81 134 133 111 133 134 1 134 134 554 134 552 553 12 12 13 13 13 13 389 389 109 110 110 110 389 389 389 389 540 521 521 557 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 | // SPDX-License-Identifier: GPL-2.0 /* * kobject.c - library routines for handling generic kernel objects * * Copyright (c) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2007 Novell Inc. * * Please see the file Documentation/core-api/kobject.rst for critical information * about using the kobject interface. */ #include <linux/kobject.h> #include <linux/string.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/random.h> /** * kobject_namespace() - Return @kobj's namespace tag. * @kobj: kobject in question * * Returns namespace tag of @kobj if its parent has namespace ops enabled * and thus @kobj should have a namespace tag associated with it. Returns * %NULL otherwise. */ const void *kobject_namespace(struct kobject *kobj) { const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj); if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE) return NULL; return kobj->ktype->namespace(kobj); } /** * kobject_get_ownership() - Get sysfs ownership data for @kobj. * @kobj: kobject in question * @uid: kernel user ID for sysfs objects * @gid: kernel group ID for sysfs objects * * Returns initial uid/gid pair that should be used when creating sysfs * representation of given kobject. Normally used to adjust ownership of * objects in a container. */ void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; if (kobj->ktype->get_ownership) kobj->ktype->get_ownership(kobj, uid, gid); } /* * populate_dir - populate directory with attributes. * @kobj: object we're working on. * * Most subsystems have a set of default attributes that are associated * with an object that registers with them. This is a helper called during * object registration that loops through the default attributes of the * subsystem and creates attributes files for them in sysfs. */ static int populate_dir(struct kobject *kobj) { struct kobj_type *t = get_ktype(kobj); struct attribute *attr; int error = 0; int i; if (t && t->default_attrs) { for (i = 0; (attr = t->default_attrs[i]) != NULL; i++) { error = sysfs_create_file(kobj, attr); if (error) break; } } return error; } static int create_dir(struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); const struct kobj_ns_type_operations *ops; int error; error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj)); if (error) return error; error = populate_dir(kobj); if (error) { sysfs_remove_dir(kobj); return error; } if (ktype) { error = sysfs_create_groups(kobj, ktype->default_groups); if (error) { sysfs_remove_dir(kobj); return error; } } /* * @kobj->sd may be deleted by an ancestor going away. Hold an * extra reference so that it stays until @kobj is gone. */ sysfs_get(kobj->sd); /* * If @kobj has ns_ops, its children need to be filtered based on * their namespace tags. Enable namespace support on @kobj->sd. */ ops = kobj_child_ns_ops(kobj); if (ops) { BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE); BUG_ON(ops->type >= KOBJ_NS_TYPES); BUG_ON(!kobj_ns_type_registered(ops->type)); sysfs_enable_ns(kobj->sd); } return 0; } static int get_kobj_path_length(const struct kobject *kobj) { int length = 1; const struct kobject *parent = kobj; /* walk up the ancestors until we hit the one pointing to the * root. * Add 1 to strlen for leading '/' of each level. */ do { if (kobject_name(parent) == NULL) return 0; length += strlen(kobject_name(parent)) + 1; parent = parent->parent; } while (parent); return length; } static int fill_kobj_path(const struct kobject *kobj, char *path, int length) { const struct kobject *parent; --length; for (parent = kobj; parent; parent = parent->parent) { int cur = strlen(kobject_name(parent)); /* back up enough to print this name with '/' */ length -= cur; if (length <= 0) return -EINVAL; memcpy(path + length, kobject_name(parent), cur); *(path + --length) = '/'; } pr_debug("kobject: '%s' (%p): %s: path = '%s'\n", kobject_name(kobj), kobj, __func__, path); return 0; } /** * kobject_get_path() - Allocate memory and fill in the path for @kobj. * @kobj: kobject in question, with which to build the path * @gfp_mask: the allocation type used to allocate the path * * Return: The newly allocated memory, caller must free with kfree(). */ char *kobject_get_path(const struct kobject *kobj, gfp_t gfp_mask) { char *path; int len; retry: len = get_kobj_path_length(kobj); if (len == 0) return NULL; path = kzalloc(len, gfp_mask); if (!path) return NULL; if (fill_kobj_path(kobj, path, len)) { kfree(path); goto retry; } return path; } EXPORT_SYMBOL_GPL(kobject_get_path); /* add the kobject to its kset's list */ static void kobj_kset_join(struct kobject *kobj) { if (!kobj->kset) return; kset_get(kobj->kset); spin_lock(&kobj->kset->list_lock); list_add_tail(&kobj->entry, &kobj->kset->list); spin_unlock(&kobj->kset->list_lock); } /* remove the kobject from its kset's list */ static void kobj_kset_leave(struct kobject *kobj) { if (!kobj->kset) return; spin_lock(&kobj->kset->list_lock); list_del_init(&kobj->entry); spin_unlock(&kobj->kset->list_lock); kset_put(kobj->kset); } static void kobject_init_internal(struct kobject *kobj) { if (!kobj) return; kref_init(&kobj->kref); INIT_LIST_HEAD(&kobj->entry); kobj->state_in_sysfs = 0; kobj->state_add_uevent_sent = 0; kobj->state_remove_uevent_sent = 0; kobj->state_initialized = 1; } static int kobject_add_internal(struct kobject *kobj) { int error = 0; struct kobject *parent; if (!kobj) return -ENOENT; if (!kobj->name || !kobj->name[0]) { WARN(1, "kobject: (%p): attempted to be registered with empty name!\n", kobj); return -EINVAL; } parent = kobject_get(kobj->parent); /* join kset if set, use it as parent if we do not already have one */ if (kobj->kset) { if (!parent) parent = kobject_get(&kobj->kset->kobj); kobj_kset_join(kobj); kobj->parent = parent; } pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n", kobject_name(kobj), kobj, __func__, parent ? kobject_name(parent) : "<NULL>", kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>"); error = create_dir(kobj); if (error) { kobj_kset_leave(kobj); kobject_put(parent); kobj->parent = NULL; /* be noisy on error issues */ if (error == -EEXIST) pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n", __func__, kobject_name(kobj)); else pr_err("%s failed for %s (error: %d parent: %s)\n", __func__, kobject_name(kobj), error, parent ? kobject_name(parent) : "'none'"); } else kobj->state_in_sysfs = 1; return error; } /** * kobject_set_name_vargs() - Set the name of a kobject. * @kobj: struct kobject to set the name of * @fmt: format string used to build the name * @vargs: vargs to format the string. */ int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs) { const char *s; if (kobj->name && !fmt) return 0; s = kvasprintf_const(GFP_KERNEL, fmt, vargs); if (!s) return -ENOMEM; /* * ewww... some of these buggers have '/' in the name ... If * that's the case, we need to make sure we have an actual * allocated copy to modify, since kvasprintf_const may have * returned something from .rodata. */ if (strchr(s, '/')) { char *t; t = kstrdup(s, GFP_KERNEL); kfree_const(s); if (!t) return -ENOMEM; strreplace(t, '/', '!'); s = t; } kfree_const(kobj->name); kobj->name = s; return 0; } /** * kobject_set_name() - Set the name of a kobject. * @kobj: struct kobject to set the name of * @fmt: format string used to build the name * * This sets the name of the kobject. If you have already added the * kobject to the system, you must call kobject_rename() in order to * change the name of the kobject. */ int kobject_set_name(struct kobject *kobj, const char *fmt, ...) { va_list vargs; int retval; va_start(vargs, fmt); retval = kobject_set_name_vargs(kobj, fmt, vargs); va_end(vargs); return retval; } EXPORT_SYMBOL(kobject_set_name); /** * kobject_init() - Initialize a kobject structure. * @kobj: pointer to the kobject to initialize * @ktype: pointer to the ktype for this kobject. * * This function will properly initialize a kobject such that it can then * be passed to the kobject_add() call. * * After this function is called, the kobject MUST be cleaned up by a call * to kobject_put(), not by a call to kfree directly to ensure that all of * the memory is cleaned up properly. */ void kobject_init(struct kobject *kobj, struct kobj_type *ktype) { char *err_str; if (!kobj) { err_str = "invalid kobject pointer!"; goto error; } if (!ktype) { err_str = "must have a ktype to be initialized properly!\n"; goto error; } if (kobj->state_initialized) { /* do not error out as sometimes we can recover */ pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n", kobj); dump_stack(); } kobject_init_internal(kobj); kobj->ktype = ktype; return; error: pr_err("kobject (%p): %s\n", kobj, err_str); dump_stack(); } EXPORT_SYMBOL(kobject_init); static __printf(3, 0) int kobject_add_varg(struct kobject *kobj, struct kobject *parent, const char *fmt, va_list vargs) { int retval; retval = kobject_set_name_vargs(kobj, fmt, vargs); if (retval) { pr_err("kobject: can not set name properly!\n"); return retval; } kobj->parent = parent; return kobject_add_internal(kobj); } /** * kobject_add() - The main kobject add function. * @kobj: the kobject to add * @parent: pointer to the parent of the kobject. * @fmt: format to name the kobject with. * * The kobject name is set and added to the kobject hierarchy in this * function. * * If @parent is set, then the parent of the @kobj will be set to it. * If @parent is NULL, then the parent of the @kobj will be set to the * kobject associated with the kset assigned to this kobject. If no kset * is assigned to the kobject, then the kobject will be located in the * root of the sysfs tree. * * Note, no "add" uevent will be created with this call, the caller should set * up all of the necessary sysfs files for the object and then call * kobject_uevent() with the UEVENT_ADD parameter to ensure that * userspace is properly notified of this kobject's creation. * * Return: If this function returns an error, kobject_put() must be * called to properly clean up the memory associated with the * object. Under no instance should the kobject that is passed * to this function be directly freed with a call to kfree(), * that can leak memory. * * If this function returns success, kobject_put() must also be called * in order to properly clean up the memory associated with the object. * * In short, once this function is called, kobject_put() MUST be called * when the use of the object is finished in order to properly free * everything. */ int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...) { va_list args; int retval; if (!kobj) return -EINVAL; if (!kobj->state_initialized) { pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n", kobject_name(kobj), kobj); dump_stack(); return -EINVAL; } va_start(args, fmt); retval = kobject_add_varg(kobj, parent, fmt, args); va_end(args); return retval; } EXPORT_SYMBOL(kobject_add); /** * kobject_init_and_add() - Initialize a kobject structure and add it to * the kobject hierarchy. * @kobj: pointer to the kobject to initialize * @ktype: pointer to the ktype for this kobject. * @parent: pointer to the parent of this kobject. * @fmt: the name of the kobject. * * This function combines the call to kobject_init() and kobject_add(). * * If this function returns an error, kobject_put() must be called to * properly clean up the memory associated with the object. This is the * same type of error handling after a call to kobject_add() and kobject * lifetime rules are the same here. */ int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...) { va_list args; int retval; kobject_init(kobj, ktype); va_start(args, fmt); retval = kobject_add_varg(kobj, parent, fmt, args); va_end(args); return retval; } EXPORT_SYMBOL_GPL(kobject_init_and_add); /** * kobject_rename() - Change the name of an object. * @kobj: object in question. * @new_name: object's new name * * It is the responsibility of the caller to provide mutual * exclusion between two different calls of kobject_rename * on the same kobject and to ensure that new_name is valid and * won't conflict with other kobjects. */ int kobject_rename(struct kobject *kobj, const char *new_name) { int error = 0; const char *devpath = NULL; const char *dup_name = NULL, *name; char *devpath_string = NULL; char *envp[2]; kobj = kobject_get(kobj); if (!kobj) return -EINVAL; if (!kobj->parent) { kobject_put(kobj); return -EINVAL; } devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { error = -ENOMEM; goto out; } devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL); if (!devpath_string) { error = -ENOMEM; goto out; } sprintf(devpath_string, "DEVPATH_OLD=%s", devpath); envp[0] = devpath_string; envp[1] = NULL; name = dup_name = kstrdup_const(new_name, GFP_KERNEL); if (!name) { error = -ENOMEM; goto out; } error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj)); if (error) goto out; /* Install the new kobject name */ dup_name = kobj->name; kobj->name = name; /* This function is mostly/only used for network interface. * Some hotplug package track interfaces by their name and * therefore want to know when the name is changed by the user. */ kobject_uevent_env(kobj, KOBJ_MOVE, envp); out: kfree_const(dup_name); kfree(devpath_string); kfree(devpath); kobject_put(kobj); return error; } EXPORT_SYMBOL_GPL(kobject_rename); /** * kobject_move() - Move object to another parent. * @kobj: object in question. * @new_parent: object's new parent (can be NULL) */ int kobject_move(struct kobject *kobj, struct kobject *new_parent) { int error; struct kobject *old_parent; const char *devpath = NULL; char *devpath_string = NULL; char *envp[2]; kobj = kobject_get(kobj); if (!kobj) return -EINVAL; new_parent = kobject_get(new_parent); if (!new_parent) { if (kobj->kset) new_parent = kobject_get(&kobj->kset->kobj); } /* old object path */ devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { error = -ENOMEM; goto out; } devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL); if (!devpath_string) { error = -ENOMEM; goto out; } sprintf(devpath_string, "DEVPATH_OLD=%s", devpath); envp[0] = devpath_string; envp[1] = NULL; error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj)); if (error) goto out; old_parent = kobj->parent; kobj->parent = new_parent; new_parent = NULL; kobject_put(old_parent); kobject_uevent_env(kobj, KOBJ_MOVE, envp); out: kobject_put(new_parent); kobject_put(kobj); kfree(devpath_string); kfree(devpath); return error; } EXPORT_SYMBOL_GPL(kobject_move); static void __kobject_del(struct kobject *kobj) { struct kernfs_node *sd; const struct kobj_type *ktype; sd = kobj->sd; ktype = get_ktype(kobj); if (ktype) sysfs_remove_groups(kobj, ktype->default_groups); /* send "remove" if the caller did not do it but sent "add" */ if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) { pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n", kobject_name(kobj), kobj); kobject_uevent(kobj, KOBJ_REMOVE); } sysfs_remove_dir(kobj); sysfs_put(sd); kobj->state_in_sysfs = 0; kobj_kset_leave(kobj); kobj->parent = NULL; } /** * kobject_del() - Unlink kobject from hierarchy. * @kobj: object. * * This is the function that should be called to delete an object * successfully added via kobject_add(). */ void kobject_del(struct kobject *kobj) { struct kobject *parent; if (!kobj) return; parent = kobj->parent; __kobject_del(kobj); kobject_put(parent); } EXPORT_SYMBOL(kobject_del); /** * kobject_get() - Increment refcount for object. * @kobj: object. */ struct kobject *kobject_get(struct kobject *kobj) { if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING "kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n", kobject_name(kobj), kobj); kref_get(&kobj->kref); } return kobj; } EXPORT_SYMBOL(kobject_get); struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { if (!kobj) return NULL; if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; return kobj; } EXPORT_SYMBOL(kobject_get_unless_zero); /* * kobject_cleanup - free kobject resources. * @kobj: object to cleanup */ static void kobject_cleanup(struct kobject *kobj) { struct kobject *parent = kobj->parent; struct kobj_type *t = get_ktype(kobj); const char *name = kobj->name; pr_debug("kobject: '%s' (%p): %s, parent %p\n", kobject_name(kobj), kobj, __func__, kobj->parent); if (t && !t->release) pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n", kobject_name(kobj), kobj); /* remove from sysfs if the caller did not do it */ if (kobj->state_in_sysfs) { pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n", kobject_name(kobj), kobj); __kobject_del(kobj); } else { /* avoid dropping the parent reference unnecessarily */ parent = NULL; } if (t && t->release) { pr_debug("kobject: '%s' (%p): calling ktype release\n", kobject_name(kobj), kobj); t->release(kobj); } /* free name if we allocated it */ if (name) { pr_debug("kobject: '%s': free name\n", name); kfree_const(name); } kobject_put(parent); } #ifdef CONFIG_DEBUG_KOBJECT_RELEASE static void kobject_delayed_cleanup(struct work_struct *work) { kobject_cleanup(container_of(to_delayed_work(work), struct kobject, release)); } #endif static void kobject_release(struct kref *kref) { struct kobject *kobj = container_of(kref, struct kobject, kref); #ifdef CONFIG_DEBUG_KOBJECT_RELEASE unsigned long delay = HZ + HZ * (get_random_int() & 0x3); pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n", kobject_name(kobj), kobj, __func__, kobj->parent, delay); INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup); schedule_delayed_work(&kobj->release, delay); #else kobject_cleanup(kobj); #endif } /** * kobject_put() - Decrement refcount for object. * @kobj: object. * * Decrement the refcount, and if 0, call kobject_cleanup(). */ void kobject_put(struct kobject *kobj) { if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING "kobject: '%s' (%p): is not initialized, yet kobject_put() is being called.\n", kobject_name(kobj), kobj); kref_put(&kobj->kref, kobject_release); } } EXPORT_SYMBOL(kobject_put); static void dynamic_kobj_release(struct kobject *kobj) { pr_debug("kobject: (%p): %s\n", kobj, __func__); kfree(kobj); } static struct kobj_type dynamic_kobj_ktype = { .release = dynamic_kobj_release, .sysfs_ops = &kobj_sysfs_ops, }; /** * kobject_create() - Create a struct kobject dynamically. * * This function creates a kobject structure dynamically and sets it up * to be a "dynamic" kobject with a default release function set up. * * If the kobject was not able to be created, NULL will be returned. * The kobject structure returned from here must be cleaned up with a * call to kobject_put() and not kfree(), as kobject_init() has * already been called on this structure. */ struct kobject *kobject_create(void) { struct kobject *kobj; kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); if (!kobj) return NULL; kobject_init(kobj, &dynamic_kobj_ktype); return kobj; } /** * kobject_create_and_add() - Create a struct kobject dynamically and * register it with sysfs. * @name: the name for the kobject * @parent: the parent kobject of this kobject, if any. * * This function creates a kobject structure dynamically and registers it * with sysfs. When you are finished with this structure, call * kobject_put() and the structure will be dynamically freed when * it is no longer being used. * * If the kobject was not able to be created, NULL will be returned. */ struct kobject *kobject_create_and_add(const char *name, struct kobject *parent) { struct kobject *kobj; int retval; kobj = kobject_create(); if (!kobj) return NULL; retval = kobject_add(kobj, parent, "%s", name); if (retval) { pr_warn("%s: kobject_add error: %d\n", __func__, retval); kobject_put(kobj); kobj = NULL; } return kobj; } EXPORT_SYMBOL_GPL(kobject_create_and_add); /** * kset_init() - Initialize a kset for use. * @k: kset */ void kset_init(struct kset *k) { kobject_init_internal(&k->kobj); INIT_LIST_HEAD(&k->list); spin_lock_init(&k->list_lock); } /* default kobject attribute operations */ static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct kobj_attribute *kattr; ssize_t ret = -EIO; kattr = container_of(attr, struct kobj_attribute, attr); if (kattr->show) ret = kattr->show(kobj, kattr, buf); return ret; } static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct kobj_attribute *kattr; ssize_t ret = -EIO; kattr = container_of(attr, struct kobj_attribute, attr); if (kattr->store) ret = kattr->store(kobj, kattr, buf, count); return ret; } const struct sysfs_ops kobj_sysfs_ops = { .show = kobj_attr_show, .store = kobj_attr_store, }; EXPORT_SYMBOL_GPL(kobj_sysfs_ops); /** * kset_register() - Initialize and add a kset. * @k: kset. */ int kset_register(struct kset *k) { int err; if (!k) return -EINVAL; if (!k->kobj.ktype) { pr_err("must have a ktype to be initialized properly!\n"); return -EINVAL; } kset_init(k); err = kobject_add_internal(&k->kobj); if (err) return err; kobject_uevent(&k->kobj, KOBJ_ADD); return 0; } EXPORT_SYMBOL(kset_register); /** * kset_unregister() - Remove a kset. * @k: kset. */ void kset_unregister(struct kset *k) { if (!k) return; kobject_del(&k->kobj); kobject_put(&k->kobj); } EXPORT_SYMBOL(kset_unregister); /** * kset_find_obj() - Search for object in kset. * @kset: kset we're looking in. * @name: object's name. * * Lock kset via @kset->subsys, and iterate over @kset->list, * looking for a matching kobject. If matching object is found * take a reference and return the object. */ struct kobject *kset_find_obj(struct kset *kset, const char *name) { struct kobject *k; struct kobject *ret = NULL; spin_lock(&kset->list_lock); list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { ret = kobject_get_unless_zero(k); break; } } spin_unlock(&kset->list_lock); return ret; } EXPORT_SYMBOL_GPL(kset_find_obj); static void kset_release(struct kobject *kobj) { struct kset *kset = container_of(kobj, struct kset, kobj); pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); kfree(kset); } static void kset_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) { if (kobj->parent) kobject_get_ownership(kobj->parent, uid, gid); } static struct kobj_type kset_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = kset_release, .get_ownership = kset_get_ownership, }; /** * kset_create() - Create a struct kset dynamically. * * @name: the name for the kset * @uevent_ops: a struct kset_uevent_ops for the kset * @parent_kobj: the parent kobject of this kset, if any. * * This function creates a kset structure dynamically. This structure can * then be registered with the system and show up in sysfs with a call to * kset_register(). When you are finished with this structure, if * kset_register() has been called, call kset_unregister() and the * structure will be dynamically freed when it is no longer being used. * * If the kset was not able to be created, NULL will be returned. */ static struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj) { struct kset *kset; int retval; kset = kzalloc(sizeof(*kset), GFP_KERNEL); if (!kset) return NULL; retval = kobject_set_name(&kset->kobj, "%s", name); if (retval) { kfree(kset); return NULL; } kset->uevent_ops = uevent_ops; kset->kobj.parent = parent_kobj; /* * The kobject of this kset will have a type of kset_ktype and belong to * no kset itself. That way we can properly free it when it is * finished being used. */ kset->kobj.ktype = &kset_ktype; kset->kobj.kset = NULL; return kset; } /** * kset_create_and_add() - Create a struct kset dynamically and add it to sysfs. * * @name: the name for the kset * @uevent_ops: a struct kset_uevent_ops for the kset * @parent_kobj: the parent kobject of this kset, if any. * * This function creates a kset structure dynamically and registers it * with sysfs. When you are finished with this structure, call * kset_unregister() and the structure will be dynamically freed when it * is no longer being used. * * If the kset was not able to be created, NULL will be returned. */ struct kset *kset_create_and_add(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj) { struct kset *kset; int error; kset = kset_create(name, uevent_ops, parent_kobj); if (!kset) return NULL; error = kset_register(kset); if (error) { kfree(kset); return NULL; } return kset; } EXPORT_SYMBOL_GPL(kset_create_and_add); static DEFINE_SPINLOCK(kobj_ns_type_lock); static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) { enum kobj_ns_type type = ops->type; int error; spin_lock(&kobj_ns_type_lock); error = -EINVAL; if (type >= KOBJ_NS_TYPES) goto out; error = -EINVAL; if (type <= KOBJ_NS_TYPE_NONE) goto out; error = -EBUSY; if (kobj_ns_ops_tbl[type]) goto out; error = 0; kobj_ns_ops_tbl[type] = ops; out: spin_unlock(&kobj_ns_type_lock); return error; } int kobj_ns_type_registered(enum kobj_ns_type type) { int registered = 0; spin_lock(&kobj_ns_type_lock); if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES)) registered = kobj_ns_ops_tbl[type] != NULL; spin_unlock(&kobj_ns_type_lock); return registered; } const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent) { const struct kobj_ns_type_operations *ops = NULL; if (parent && parent->ktype && parent->ktype->child_ns_type) ops = parent->ktype->child_ns_type(parent); return ops; } const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj) { return kobj_child_ns_ops(kobj->parent); } bool kobj_ns_current_may_mount(enum kobj_ns_type type) { bool may_mount = true; spin_lock(&kobj_ns_type_lock); if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && kobj_ns_ops_tbl[type]) may_mount = kobj_ns_ops_tbl[type]->current_may_mount(); spin_unlock(&kobj_ns_type_lock); return may_mount; } void *kobj_ns_grab_current(enum kobj_ns_type type) { void *ns = NULL; spin_lock(&kobj_ns_type_lock); if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && kobj_ns_ops_tbl[type]) ns = kobj_ns_ops_tbl[type]->grab_current_ns(); spin_unlock(&kobj_ns_type_lock); return ns; } EXPORT_SYMBOL_GPL(kobj_ns_grab_current); const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk) { const void *ns = NULL; spin_lock(&kobj_ns_type_lock); if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && kobj_ns_ops_tbl[type]) ns = kobj_ns_ops_tbl[type]->netlink_ns(sk); spin_unlock(&kobj_ns_type_lock); return ns; } const void *kobj_ns_initial(enum kobj_ns_type type) { const void *ns = NULL; spin_lock(&kobj_ns_type_lock); if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && kobj_ns_ops_tbl[type]) ns = kobj_ns_ops_tbl[type]->initial_ns(); spin_unlock(&kobj_ns_type_lock); return ns; } void kobj_ns_drop(enum kobj_ns_type type, void *ns) { spin_lock(&kobj_ns_type_lock); if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns) kobj_ns_ops_tbl[type]->drop_ns(ns); spin_unlock(&kobj_ns_type_lock); } EXPORT_SYMBOL_GPL(kobj_ns_drop); |
381 382 382 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal.h: mm/ internal definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/tracepoint-defs.h> /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_ATOMIC) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) void page_writeback_init(void); /* * This is a file-backed mapping, and is about to be memory mapped - invoke its * mmap hook and safely handle error conditions. On error, VMA hooks will be * mutated. * * @file: File which backs the mapping. * @vma: VMA which we are mapping. * * Returns: 0 if success, error otherwise. */ int mmap_file(struct file *file, struct vm_area_struct *vma); /* * If the VMA has a close hook then close it, and since closing it might leave * it in an inconsistent state which makes the use of any hooks suspect, clear * them down by installing dummy empty hooks. */ void vma_close(struct vm_area_struct *vma); vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, pgoff_t end, struct pagevec *pvec, pgoff_t *indices); /** * page_evictable - test whether a page is evictable * @page: the page to test * * Test whether page is evictable--i.e., should be placed on active/inactive * lists vs unevictable list. * * Reasons page might not be evictable: * (1) page's mapping marked unevictable * (2) page is part of an mlocked VMA * */ static inline bool page_evictable(struct page *page) { bool ret; /* Prevent address_space of inode and swap cache from being freed */ rcu_read_lock(); ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); rcu_read_unlock(); return ret; } /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } extern unsigned long highest_memmap_pfn; /* * Maximum number of reclaim retries without progress before the OOM * killer is consider the only way forward. */ #define MAX_RECLAIM_RETRIES 16 /* * in mm/vmscan.c: */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); /* * in mm/rmap.c: */ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/memcontrol.c: */ extern bool cgroup_memory_nokmem; /* * in mm/page_alloc.c */ /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. */ enum zone_type highest_zoneidx; bool spread_dirty_pages; }; /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). * * 1) Any buddy B1 will have an order O twin B2 which satisfies * the following equation: * B2 = B1 ^ (1 << O) * For example, if the starting buddy (buddy2) is #8 its order * 1 buddy is #10: * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 * * 2) Any buddy B will have an order O+1 parent P which * satisfies the following equation: * P = B & ~(1 << O) * * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) { return page_pfn ^ (1 << order); } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { if (zone->contiguous) return pfn_to_page(start_pfn); return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); extern void zone_pcp_update(struct zone *zone, int cpu_online); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* * in mm/compaction.c */ /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts * at the end of a zone and migrate_pfn begins at the start. Movable pages * are moved to the end of a zone during a compaction run and the run * completes when free_pfn <= migrate_pfn */ struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. * isolate_migratepages_block will update the value to the next pfn * after the last isolated one. */ unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ bool rescan; /* Rescanning the same pageblock */ bool alloc_contig; /* alloc_contig_range allocation */ }; /* * Used in direct compaction when a page should be taken from the freelists * immediately when one is created during the free path. */ struct capture_control { struct compact_control *cc; struct page *page; }; unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); #endif int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); /* * This function returns the order of a free page in the buddy system. In * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use buddy_order_unsafe() below. */ static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* * Like buddy_order(), but for callers who cannot afford to hold the zone lock. * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * * READ_ONCE is used so that if the caller assigns the result into a local * variable and e.g. tests it for valid range before using, the compiler cannot * decide to remove the variable and inline the page_private(page) multiple * times, potentially observing different values in the tests and the actual * use of the result. */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) /* * These three helpers classifies VMAs for virtual memory accounting. */ /* * Executable code area - executable, not writable, not stack */ static inline bool is_exec_mapping(vm_flags_t flags) { return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } /* * Stack area - automatically grows in one direction * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. */ static inline bool is_stack_mapping(vm_flags_t flags) { return (flags & VM_STACK) == VM_STACK; } /* * Data area - private, writable, not stack */ static inline bool is_data_mapping(vm_flags_t flags) { return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev); void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); #ifdef CONFIG_MMU extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } /* * must be called with vma's mmap_lock held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); extern unsigned int munlock_vma_page(struct page *page); extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); /* * Clear the page's PageMlocked(). This can be useful in a situation where * we want to unconditionally remove a page from the pagecache -- e.g., * on truncation or freeing. * * It is legal to call this function for any page, mlocked or not. * If called for a page that is still mapped by mlocked vmas, all we do * is revert to lazy LRU behaviour -- semantics are not broken. */ extern void clear_page_mlock(struct page *page); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* * At what user virtual address is page expected in vma? * Returns -EFAULT if all of the page is outside the range of vma. * If page is a compound head, the entire compound page is considered. */ static inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff; unsigned long address; VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ pgoff = page_to_pgoff(page); if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; } else if (PageHead(page) && pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { address = -EFAULT; } return address; } /* * Then at what user virtual address will none of the page be found in vma? * Assumes that vma_address() already returned a good starting address. * If page is a compound head, the entire compound page is considered. */ static inline unsigned long vma_address_end(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff; unsigned long address; VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ pgoff = page_to_pgoff(page) + compound_nr(page); address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) address = vma->vm_end; return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, struct file *fpin) { int flags = vmf->flags; if (fpin) return fpin; /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_lock if only * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. */ if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); mmap_read_unlock(vmf->vma->vm_mm); } return fpin; } #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } #endif /* !CONFIG_MMU */ /* * Return the mem_map entry representing the 'offset' subpage within * the maximally aligned gigantic page 'base'. Handle any discontiguity * in the mem_map at MAX_ORDER_NR_PAGES boundaries. */ static inline struct page *mem_map_offset(struct page *base, int offset) { if (unlikely(offset >= MAX_ORDER_NR_PAGES)) return nth_page(base, offset); return base + offset; } /* * Iterator over all subpages within the maximally aligned gigantic * page 'base'. Handle any discontiguity in the mem_map. */ static inline struct page *mem_map_next(struct page *iter, struct page *base, int offset) { if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { unsigned long pfn = page_to_pfn(base) + offset; if (!pfn_valid(pfn)) return NULL; return pfn_to_page(pfn); } return iter + 1; } /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE }; #ifdef CONFIG_DEBUG_MEMORY_INIT extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) extern void mminit_verify_pageflags_layout(void); extern void mminit_verify_zonelist(void); #else static inline void mminit_dprintk(enum mminit_level level, const char *prefix, const char *fmt, ...) { } static inline void mminit_verify_pageflags_layout(void) { } static inline void mminit_verify_zonelist(void) { } #endif /* CONFIG_DEBUG_MEMORY_INIT */ /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ #if defined(CONFIG_SPARSEMEM) extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn); #else static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { } #endif /* CONFIG_SPARSEMEM */ #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } static inline int find_next_best_node(int node, nodemask_t *used_node_mask) { return NUMA_NO_NODE; } #endif extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN #define ALLOC_WMARK_LOW WMARK_LOW #define ALLOC_WMARK_HIGH WMARK_HIGH #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) /* * Only MMU archs have async oom victim reclaim - aka oom_reaper so we * cannot assume a reduced access to memory reserves is sufficient for * !MMU */ #ifdef CONFIG_MMU #define ALLOC_OOM 0x08 #else #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access * to 25% of the min watermark or * 62.5% if __GFP_HIGH is set. */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ #else #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; /* * only for MM internal work items which do not depend on * any allocations or locks which might depend on allocations */ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { } static inline void try_to_unmap_flush_dirty(void) { } static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; static inline bool is_migrate_highatomic(enum migratetype migratetype) { return migratetype == MIGRATE_HIGHATOMIC; } static inline bool is_migrate_highatomic_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; } void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; }; /* * mm/vmalloc.c */ #ifdef CONFIG_MMU int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); #else static inline int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { return -EINVAL; } #endif void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); #endif /* __MM_INTERNAL_H */ |
7 4 3 3 23 20 3 23 6 5 11 2 12 1 1 10 2 1 3 5 2 3 1 1 23 23 11 2 2 8 6 3 17 16 1 16 7 296 138 155 235 58 135 3 2 2 2 1 1 1 1 1 2 1 2 2 1 2 2 1 2 2 1 1 2 1 1 1 1 1 1 1 1 4 2 2 2 1 1 2 23 2 1 1 4 1 1 4 7 1 1 1 1 2 2 2 2 2 1 3 2 1 2 4 2 2 2 16 12 5 9 7 1 2 5 11 12 2 4 2 2 1 2 1 1 2 1 1 1 1 9 62 2 7 1 1 2 1 3 2 1 2 1 180 54 57 4 379 2 84 296 221 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_sk(sk)->is_icsk) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, opt); sk_dst_reset(sk); return opt; } static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IPV6_ADDRFORM: case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_MSFILTER: return true; } return false; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_ipv6_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 *psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; bool needs_rtnl = setsockopt_needs_rtnl(optname); if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); if (needs_rtnl) rtnl_lock(); lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. */ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); /* * Sock is moving from IPv6 to IPv4 (sk_prot), so * remove it from the refcnt debug socks count in the * original family... */ sk_refcnt_debug_dec(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } /* Disable all options not to allocate memory anymore, * but there is still a race. See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); /* * ... and add it to the refcnt debug socks count * in the new family. -acme */ sk_refcnt_debug_inc(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; np->tclass = val; retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_sk(sk)->transparent = valbool; retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_sk(sk)->freebind = valbool; retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. */ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) goto e_inval; if (val > 255 || val < -1) goto e_inval; np->hop_limit = val; retv = 0; break; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) break; if (optlen < sizeof(int)) goto e_inval; if (val > 255 || val < -1) goto e_inval; np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); retv = 0; break; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) goto e_inval; if (val != valbool) goto e_inval; np->mc_loop = valbool; retv = 0; break; case IPV6_ |