// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/page_io.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Swap reorganised 29.12.95,
 * Asynchronous swapping added 30.12.95. Stephen Tweedie
 * Removed race in async swapping. 14.4.1996. Bruno Haible
 * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
 * Always use brw_page, life becomes simpler.
12 May 1998 Eric Biederman */ #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/swapops.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/psi.h> #include <linux/uio.h> #include <linux/sched/task.h> #include <linux/delayacct.h> #include <linux/zswap.h> #include "swap.h" static void __end_swap_bio_write(struct bio *bio) { struct folio *folio = bio_first_folio_all(bio); if (bio->bi_status) { /* * We failed to write the page out to swap-space. * Re-dirty the page in order to avoid it being reclaimed. * Also print a dire warning that things will go BAD (tm) * very quickly. * * Also clear PG_reclaim to avoid folio_rotate_reclaimable() */ folio_mark_dirty(folio); pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); folio_clear_reclaim(folio); } folio_end_writeback(folio); } static void end_swap_bio_write(struct bio *bio) { __end_swap_bio_write(bio); bio_put(bio); } static void __end_swap_bio_read(struct bio *bio) { struct folio *folio = bio_first_folio_all(bio); if (bio->bi_status) { pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } else { folio_mark_uptodate(folio); } folio_unlock(folio); } static void end_swap_bio_read(struct bio *bio) { __end_swap_bio_read(bio); bio_put(bio); } int generic_swapfile_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; unsigned blocks_per_page; unsigned long page_no; unsigned blkbits; sector_t probe_block; sector_t last_block; sector_t lowest_block = -1; sector_t highest_block = 0; int nr_extents = 0; int ret; blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; /* * Map all the blocks into the extent tree. This code doesn't try * to be very smart. 
*/ probe_block = 0; page_no = 0; last_block = i_size_read(inode) >> blkbits; while ((probe_block + blocks_per_page) <= last_block && page_no < sis->max) { unsigned block_in_page; sector_t first_block; cond_resched(); first_block = probe_block; ret = bmap(inode, &first_block); if (ret || !first_block) goto bad_bmap; /* * It must be PAGE_SIZE aligned on-disk */ if (first_block & (blocks_per_page - 1)) { probe_block++; goto reprobe; } for (block_in_page = 1; block_in_page < blocks_per_page; block_in_page++) { sector_t block; block = probe_block + block_in_page; ret = bmap(inode, &block); if (ret || !block) goto bad_bmap; if (block != first_block + block_in_page) { /* Discontiguity */ probe_block++; goto reprobe; } } first_block >>= (PAGE_SHIFT - blkbits); if (page_no) { /* exclude the header page */ if (first_block < lowest_block) lowest_block = first_block; if (first_block > highest_block) highest_block = first_block; } /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ ret = add_swap_extent(sis, page_no, 1, first_block); if (ret < 0) goto out; nr_extents += ret; page_no++; probe_block += blocks_per_page; reprobe: continue; } ret = nr_extents; *span = 1 + highest_block - lowest_block; if (page_no == 0) page_no = 1; /* force Empty message */ sis->max = page_no; sis->pages = page_no - 1; out: return ret; bad_bmap: pr_err("swapon: swapfile has holes\n"); ret = -EINVAL; goto out; } static bool is_folio_zero_filled(struct folio *folio) { unsigned int pos, last_pos; unsigned long *data; unsigned int i; last_pos = PAGE_SIZE / sizeof(*data) - 1; for (i = 0; i < folio_nr_pages(folio); i++) { data = kmap_local_folio(folio, i * PAGE_SIZE); /* * Check last word first, incase the page is zero-filled at * the start and has non-zero data at the end, which is common * in real-world workloads. */ if (data[last_pos]) { kunmap_local(data); return false; } for (pos = 0; pos < last_pos; pos++) { if (data[pos]) { kunmap_local(data); return false; } } kunmap_local(data); } return true; } static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); swp_entry_t entry; unsigned int i; for (i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); set_bit(swp_offset(entry), sis->zeromap); } count_vm_events(SWPOUT_ZERO, nr_pages); if (objcg) { count_objcg_events(objcg, SWPOUT_ZERO, nr_pages); obj_cgroup_put(objcg); } } static void swap_zeromap_folio_clear(struct folio *folio) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); swp_entry_t entry; unsigned int i; for (i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); clear_bit(swp_offset(entry), sis->zeromap); } } /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) { int ret = 0; if (folio_free_swap(folio)) goto out_unlock; /* * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. */ ret = arch_prepare_to_swap(folio); if (ret) { folio_mark_dirty(folio); goto out_unlock; } /* * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages. * The bits in zeromap are protected by the locked swapcache folio * and atomic updates are used to protect against read-modify-write * corruption due to other zero swap entries seeing concurrent updates. 
*/ if (is_folio_zero_filled(folio)) { swap_zeromap_folio_set(folio); goto out_unlock; } /* * Clear bits this folio occupies in the zeromap to prevent zero data * being read in from any previous zero writes that occupied the same * swap entries. */ swap_zeromap_folio_clear(folio); if (zswap_store(folio)) { count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); goto out_unlock; } if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; } __swap_writepage(folio, swap_plug); return 0; out_unlock: folio_unlock(folio); return ret; } static inline void count_swpout_vm_event(struct folio *folio) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (unlikely(folio_test_pmd_mappable(folio))) { count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } #endif count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio)); count_vm_events(PSWPOUT, folio_nr_pages(folio)); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) { struct cgroup_subsys_state *css; struct mem_cgroup *memcg; memcg = folio_memcg(folio); if (!memcg) return; rcu_read_lock(); css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } #else #define bio_associate_blkg_from_page(bio, folio) do { } while (0) #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ struct swap_iocb { struct kiocb iocb; struct bio_vec bvec[SWAP_CLUSTER_MAX]; int pages; int len; }; static mempool_t *sio_pool; int sio_pool_init(void) { if (!sio_pool) { mempool_t *pool = mempool_create_kmalloc_pool( SWAP_CLUSTER_MAX, sizeof(struct swap_iocb)); if (cmpxchg(&sio_pool, NULL, pool)) mempool_destroy(pool); } if (!sio_pool) return -ENOMEM; return 0; } static void sio_write_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); struct page *page = sio->bvec[0].bv_page; int p; if (ret != sio->len) { /* * In the case of swap-over-nfs, this can be a * temporary failure if the system has limited * memory for allocating transmit buffers. * Mark the page dirty and avoid * folio_rotate_reclaimable but rate-limit the * messages. */ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", ret, swap_dev_pos(page_swap_entry(page))); for (p = 0; p < sio->pages; p++) { page = sio->bvec[p].bv_page; set_page_dirty(page); ClearPageReclaim(page); } } for (p = 0; p < sio->pages; p++) end_page_writeback(sio->bvec[p].bv_page); mempool_free(sio, sio_pool); } static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_iocb *sio = swap_plug ? 
*swap_plug : NULL; struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); if (sio) { if (sio->iocb.ki_filp != swap_file || sio->iocb.ki_pos + sio->len != pos) { swap_write_unplug(sio); sio = NULL; } } if (!sio) { sio = mempool_alloc(sio_pool, GFP_NOIO); init_sync_kiocb(&sio->iocb, swap_file); sio->iocb.ki_complete = sio_write_complete; sio->iocb.ki_pos = pos; sio->pages = 0; sio->len = 0; } bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); sio->len += folio_size(folio); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) { swap_write_unplug(sio); sio = NULL; } if (swap_plug) *swap_plug = sio; } static void swap_writepage_bdev_sync(struct folio *folio, struct swap_info_struct *sis) { struct bio_vec bv; struct bio bio; bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP); bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); bio_associate_blkg_from_page(&bio, folio); count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); submit_bio_wait(&bio); __end_swap_bio_write(&bio); } static void swap_writepage_bdev_async(struct folio *folio, struct swap_info_struct *sis) { struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO); bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_write; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); bio_associate_blkg_from_page(bio, folio); count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); submit_bio(bio); } void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(folio, swap_plug); /* * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race * is safe. 
*/ else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) swap_writepage_bdev_sync(folio, sis); else swap_writepage_bdev_async(folio, sis); } void swap_write_unplug(struct swap_iocb *sio) { struct iov_iter from; struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); } static void sio_read_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); int p; if (ret == sio->len) { for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); folio_mark_uptodate(folio); folio_unlock(folio); } count_vm_events(PSWPIN, sio->pages); } else { for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); folio_unlock(folio); } pr_alert_ratelimited("Read-error on swap-device\n"); } mempool_free(sio, sio_pool); } static bool swap_read_folio_zeromap(struct folio *folio) { int nr_pages = folio_nr_pages(folio); struct obj_cgroup *objcg; bool is_zeromap; /* * Swapping in a large folio that is partially in the zeromap is not * currently handled. Return true without marking the folio uptodate so * that an IO error is emitted (e.g. do_swap_page() will sigbus). */ if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages, &is_zeromap) != nr_pages)) return true; if (!is_zeromap) return false; objcg = get_obj_cgroup_from_folio(folio); count_vm_events(SWPIN_ZERO, nr_pages); if (objcg) { count_objcg_events(objcg, SWPIN_ZERO, nr_pages); obj_cgroup_put(objcg); } folio_zero_range(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); return true; } static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_iocb *sio = NULL; loff_t pos = swap_dev_pos(folio->swap); if (plug) sio = *plug; if (sio) { if (sio->iocb.ki_filp != sis->swap_file || sio->iocb.ki_pos + sio->len != pos) { swap_read_unplug(sio); sio = NULL; } } if (!sio) { sio = mempool_alloc(sio_pool, GFP_KERNEL); init_sync_kiocb(&sio->iocb, sis->swap_file); sio->iocb.ki_pos = pos; sio->iocb.ki_complete = sio_read_complete; sio->pages = 0; sio->len = 0; } bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); sio->len += folio_size(folio); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { swap_read_unplug(sio); sio = NULL; } if (plug) *plug = sio; } static void swap_read_folio_bdev_sync(struct folio *folio, struct swap_info_struct *sis) { struct bio_vec bv; struct bio bio; bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. 
*/ get_task_struct(current); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); __end_swap_bio_read(&bio); put_task_struct(current); } static void swap_read_folio_bdev_async(struct folio *folio, struct swap_info_struct *sis) { struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); } void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO; bool workingset = folio_test_workingset(folio); unsigned long pflags; bool in_thrashing; VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); /* * Count submission time as memory stall and delay. When the device * is congested, or the submitting cgroup IO-throttled, submission * can be a significant part of overall IO time. */ if (workingset) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); } delayacct_swapin_start(); if (swap_read_folio_zeromap(folio)) { folio_unlock(folio); goto finish; } if (zswap_load(folio) != -ENOENT) goto finish; /* We have to read from slower devices. Increase zswap protection. */ zswap_folio_swapin(folio); if (data_race(sis->flags & SWP_FS_OPS)) { swap_read_folio_fs(folio, plug); } else if (synchronous) { swap_read_folio_bdev_sync(folio, sis); } else { swap_read_folio_bdev_async(folio, sis); } finish: if (workingset) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } delayacct_swapin_end(); } void __swap_read_unplug(struct swap_iocb *sio) { struct iov_iter from; struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); } |
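is_folio_zero_filled() above lets swap_writeout() skip the block layer entirely for pages that contain only zeroes, and it deliberately probes the last word before scanning the rest, since partially used pages often have their non-zero data at the end. Below is a standalone userspace sketch of that same scan order on a plain 4 KiB buffer; PAGE_SZ and page_is_zero_filled() are illustrative names, not kernel symbols.

/*
 * Illustrative userspace sketch of the scan order used by
 * is_folio_zero_filled(): check the last word first, then the rest.
 * PAGE_SZ and page_is_zero_filled() are hypothetical stand-ins.
 */
#include <stdbool.h>
#include <stddef.h>

#define PAGE_SZ 4096

static bool page_is_zero_filled(const unsigned long *buf)
{
	size_t last = PAGE_SZ / sizeof(*buf) - 1;
	size_t i;

	/* Non-zero tails are common in partially used pages, so test it first. */
	if (buf[last])
		return false;

	for (i = 0; i < last; i++)
		if (buf[i])
			return false;

	return true;
}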
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ToupTek UCMOS / AmScope MU series camera driver
 * TODO: contrast with ScopeTek / AmScope MDC cameras
 *
 * Copyright (C) 2012-2014 John McMaster <JohnDMcMaster@gmail.com>
 *
 * Special thanks to Bushing for helping with the decrypt algorithm and
 * Sean O'Sullivan / the Rensselaer Center for Open Source
 * Software (RCOS) for helping me learn kernel development
 */

#include "gspca.h"

#define MODULE_NAME "touptek"

MODULE_AUTHOR("John McMaster");
MODULE_DESCRIPTION("ToupTek UCMOS / Amscope MU microscope camera driver");
MODULE_LICENSE("GPL");

/*
 * Exposure reg is linear with exposure time
 * Exposure (sec), E (reg)
 * 0.000400, 0x0002
 * 0.001000, 0x0005
 * 0.005000, 0x0019
 * 0.020000,
0x0064 * 0.080000, 0x0190 * 0.400000, 0x07D0 * 1.000000, 0x1388 * 2.000000, 0x2710 * * Three gain stages * 0x1000: master channel enable bit * 0x007F: low gain bits * 0x0080: medium gain bit * 0x0100: high gain bit * gain = enable * (1 + regH) * (1 + regM) * z * regL * * Gain implementation * Want to do something similar to mt9v011.c's set_balance * * Gain does not vary with resolution (checked 640x480 vs 1600x1200) * * Constant derivation: * * Raw data: * Gain, GTOP, B, R, GBOT * 1.00, 0x105C, 0x1068, 0x10C8, 0x105C * 1.20, 0x106E, 0x107E, 0x10D6, 0x106E * 1.40, 0x10C0, 0x10CA, 0x10E5, 0x10C0 * 1.60, 0x10C9, 0x10D4, 0x10F3, 0x10C9 * 1.80, 0x10D2, 0x10DE, 0x11C1, 0x10D2 * 2.00, 0x10DC, 0x10E9, 0x11C8, 0x10DC * 2.20, 0x10E5, 0x10F3, 0x11CF, 0x10E5 * 2.40, 0x10EE, 0x10FE, 0x11D7, 0x10EE * 2.60, 0x10F7, 0x11C4, 0x11DE, 0x10F7 * 2.80, 0x11C0, 0x11CA, 0x11E5, 0x11C0 * 3.00, 0x11C5, 0x11CF, 0x11ED, 0x11C5 * * zR = 0.0069605943152454778 * about 3/431 = 0.0069605568445475635 * zB = 0.0095695970695970703 * about 6/627 = 0.0095693779904306216 * zG = 0.010889328063241107 * about 6/551 = 0.010889292196007259 * about 10 bits for constant + 7 bits for value => at least 17 bit * intermediate with 32 bit ints should be fine for overflow etc * Essentially gains are in range 0-0x001FF * * However, V4L expects a main gain channel + R and B balance * To keep things simple for now saturate the values of balance is too high/low * This isn't really ideal but easy way to fit the Linux model * * Converted using gain model turns out to be quite linear: * Gain, GTOP, B, R, GBOT * 1.00, 92, 104, 144, 92 * 1.20, 110, 126, 172, 110 * 1.40, 128, 148, 202, 128 * 1.60, 146, 168, 230, 146 * 1.80, 164, 188, 260, 164 * 2.00, 184, 210, 288, 184 * 2.20, 202, 230, 316, 202 * 2.40, 220, 252, 348, 220 * 2.60, 238, 272, 376, 238 * 2.80, 256, 296, 404, 256 * 3.00, 276, 316, 436, 276 * * Maximum gain is 0x7FF * 2 * 2 => 0x1FFC (8188) * or about 13 effective bits of gain * The highest the commercial driver goes in my setup 436 * However, because could *maybe* damage circuits * limit the gain until have a reason to go higher * Solution: gain clipped and warning emitted */ #define GAIN_MAX 511 /* Frame sync is a short read */ #define BULK_SIZE 0x4000 /* MT9E001 reg names to give a rough approximation */ #define REG_COARSE_INTEGRATION_TIME_ 0x3012 #define REG_GROUPED_PARAMETER_HOLD_ 0x3022 #define REG_MODE_SELECT 0x0100 #define REG_OP_SYS_CLK_DIV 0x030A #define REG_VT_SYS_CLK_DIV 0x0302 #define REG_PRE_PLL_CLK_DIV 0x0304 #define REG_VT_PIX_CLK_DIV 0x0300 #define REG_OP_PIX_CLK_DIV 0x0308 #define REG_PLL_MULTIPLIER 0x0306 #define REG_COARSE_INTEGRATION_TIME_ 0x3012 #define REG_FRAME_LENGTH_LINES 0x0340 #define REG_FRAME_LENGTH_LINES_ 0x300A #define REG_GREEN1_GAIN 0x3056 #define REG_GREEN2_GAIN 0x305C #define REG_GROUPED_PARAMETER_HOLD 0x0104 #define REG_LINE_LENGTH_PCK_ 0x300C #define REG_MODE_SELECT 0x0100 #define REG_PLL_MULTIPLIER 0x0306 #define REG_READ_MODE 0x3040 #define REG_BLUE_GAIN 0x3058 #define REG_RED_GAIN 0x305A #define REG_RESET_REGISTER 0x301A #define REG_SCALE_M 0x0404 #define REG_SCALING_MODE 0x0400 #define REG_SOFTWARE_RESET 0x0103 #define REG_X_ADDR_END 0x0348 #define REG_X_ADDR_START 0x0344 #define REG_X_ADDR_START 0x0344 #define REG_X_OUTPUT_SIZE 0x034C #define REG_Y_ADDR_END 0x034A #define REG_Y_ADDR_START 0x0346 #define REG_Y_OUTPUT_SIZE 0x034E /* specific webcam descriptor */ struct sd { struct gspca_dev gspca_dev; /* !! 
must be the first item */ /* How many bytes this frame */ unsigned int this_f; /* Device has separate gains for each Bayer quadrant V4L supports master gain which is referenced to G1/G2 and supplies individual balance controls for R/B */ struct v4l2_ctrl *blue; struct v4l2_ctrl *red; }; /* Used to simplify reg write error handling */ struct cmd { u16 value; u16 index; }; static const struct v4l2_pix_format vga_mode[] = { {800, 600, V4L2_PIX_FMT_SGRBG8, V4L2_FIELD_NONE, .bytesperline = 800, .sizeimage = 800 * 600, .colorspace = V4L2_COLORSPACE_SRGB}, {1600, 1200, V4L2_PIX_FMT_SGRBG8, V4L2_FIELD_NONE, .bytesperline = 1600, .sizeimage = 1600 * 1200, .colorspace = V4L2_COLORSPACE_SRGB}, {3264, 2448, V4L2_PIX_FMT_SGRBG8, V4L2_FIELD_NONE, .bytesperline = 3264, .sizeimage = 3264 * 2448, .colorspace = V4L2_COLORSPACE_SRGB}, }; /* * As there's no known frame sync, the only way to keep synced is to try hard * to never miss any packets */ #if MAX_NURBS < 4 #error "Not enough URBs in the gspca table" #endif static int val_reply(struct gspca_dev *gspca_dev, const char *reply, int rc) { if (rc < 0) { gspca_err(gspca_dev, "reply has error %d\n", rc); return -EIO; } if (rc != 1) { gspca_err(gspca_dev, "Bad reply size %d\n", rc); return -EIO; } if (reply[0] != 0x08) { gspca_err(gspca_dev, "Bad reply 0x%02x\n", (int)reply[0]); return -EIO; } return 0; } static void reg_w(struct gspca_dev *gspca_dev, u16 value, u16 index) { char *buff = gspca_dev->usb_buf; int rc; gspca_dbg(gspca_dev, D_USBO, "reg_w bReq=0x0B, bReqT=0xC0, wVal=0x%04X, wInd=0x%04X\n\n", value, index); rc = usb_control_msg(gspca_dev->dev, usb_rcvctrlpipe(gspca_dev->dev, 0), 0x0B, 0xC0, value, index, buff, 1, 500); gspca_dbg(gspca_dev, D_USBO, "rc=%d, ret={0x%02x}\n", rc, (int)buff[0]); if (rc < 0) { gspca_err(gspca_dev, "Failed reg_w(0x0B, 0xC0, 0x%04X, 0x%04X) w/ rc %d\n", value, index, rc); gspca_dev->usb_err = rc; return; } if (val_reply(gspca_dev, buff, rc)) { gspca_err(gspca_dev, "Bad reply to reg_w(0x0B, 0xC0, 0x%04X, 0x%04X\n", value, index); gspca_dev->usb_err = -EIO; } } static void reg_w_buf(struct gspca_dev *gspca_dev, const struct cmd *p, int l) { do { reg_w(gspca_dev, p->value, p->index); p++; } while (--l > 0); } static void setexposure(struct gspca_dev *gspca_dev, s32 val) { u16 value; unsigned int w = gspca_dev->pixfmt.width; if (w == 800) value = val * 5; else if (w == 1600) value = val * 3; else if (w == 3264) value = val * 3 / 2; else { gspca_err(gspca_dev, "Invalid width %u\n", w); gspca_dev->usb_err = -EINVAL; return; } gspca_dbg(gspca_dev, D_STREAM, "exposure: 0x%04X ms\n\n", value); /* Wonder if there's a good reason for sending it twice */ /* probably not but leave it in because...why not */ reg_w(gspca_dev, value, REG_COARSE_INTEGRATION_TIME_); reg_w(gspca_dev, value, REG_COARSE_INTEGRATION_TIME_); } static int gainify(int in) { /* * TODO: check if there are any issues with corner cases * 0x000 (0):0x07F (127): regL * 0x080 (128) - 0x0FF (255): regM, regL * 0x100 (256) - max: regH, regM, regL */ if (in <= 0x7F) return 0x1000 | in; else if (in <= 0xFF) return 0x1080 | in / 2; else return 0x1180 | in / 4; } static void setggain(struct gspca_dev *gspca_dev, u16 global_gain) { u16 normalized; normalized = gainify(global_gain); gspca_dbg(gspca_dev, D_STREAM, "gain G1/G2 (0x%04X): 0x%04X (src 0x%04X)\n\n", REG_GREEN1_GAIN, normalized, global_gain); reg_w(gspca_dev, normalized, REG_GREEN1_GAIN); reg_w(gspca_dev, normalized, REG_GREEN2_GAIN); } static void setbgain(struct gspca_dev *gspca_dev, u16 gain, u16 global_gain) { u16 
normalized; normalized = global_gain + ((u32)global_gain) * gain / GAIN_MAX; if (normalized > GAIN_MAX) { gspca_dbg(gspca_dev, D_STREAM, "Truncating blue 0x%04X w/ value 0x%04X\n\n", GAIN_MAX, normalized); normalized = GAIN_MAX; } normalized = gainify(normalized); gspca_dbg(gspca_dev, D_STREAM, "gain B (0x%04X): 0x%04X w/ source 0x%04X\n\n", REG_BLUE_GAIN, normalized, gain); reg_w(gspca_dev, normalized, REG_BLUE_GAIN); } static void setrgain(struct gspca_dev *gspca_dev, u16 gain, u16 global_gain) { u16 normalized; normalized = global_gain + ((u32)global_gain) * gain / GAIN_MAX; if (normalized > GAIN_MAX) { gspca_dbg(gspca_dev, D_STREAM, "Truncating gain 0x%04X w/ value 0x%04X\n\n", GAIN_MAX, normalized); normalized = GAIN_MAX; } normalized = gainify(normalized); gspca_dbg(gspca_dev, D_STREAM, "gain R (0x%04X): 0x%04X w / source 0x%04X\n\n", REG_RED_GAIN, normalized, gain); reg_w(gspca_dev, normalized, REG_RED_GAIN); } static void configure_wh(struct gspca_dev *gspca_dev) { unsigned int w = gspca_dev->pixfmt.width; gspca_dbg(gspca_dev, D_STREAM, "configure_wh\n\n"); if (w == 800) { static const struct cmd reg_init_res[] = { {0x0060, REG_X_ADDR_START}, {0x0CD9, REG_X_ADDR_END}, {0x0036, REG_Y_ADDR_START}, {0x098F, REG_Y_ADDR_END}, {0x07C7, REG_READ_MODE}, }; reg_w_buf(gspca_dev, reg_init_res, ARRAY_SIZE(reg_init_res)); } else if (w == 1600) { static const struct cmd reg_init_res[] = { {0x009C, REG_X_ADDR_START}, {0x0D19, REG_X_ADDR_END}, {0x0068, REG_Y_ADDR_START}, {0x09C5, REG_Y_ADDR_END}, {0x06C3, REG_READ_MODE}, }; reg_w_buf(gspca_dev, reg_init_res, ARRAY_SIZE(reg_init_res)); } else if (w == 3264) { static const struct cmd reg_init_res[] = { {0x00E8, REG_X_ADDR_START}, {0x0DA7, REG_X_ADDR_END}, {0x009E, REG_Y_ADDR_START}, {0x0A2D, REG_Y_ADDR_END}, {0x0241, REG_READ_MODE}, }; reg_w_buf(gspca_dev, reg_init_res, ARRAY_SIZE(reg_init_res)); } else { gspca_err(gspca_dev, "bad width %u\n", w); gspca_dev->usb_err = -EINVAL; return; } reg_w(gspca_dev, 0x0000, REG_SCALING_MODE); reg_w(gspca_dev, 0x0010, REG_SCALE_M); reg_w(gspca_dev, w, REG_X_OUTPUT_SIZE); reg_w(gspca_dev, gspca_dev->pixfmt.height, REG_Y_OUTPUT_SIZE); if (w == 800) { reg_w(gspca_dev, 0x0384, REG_FRAME_LENGTH_LINES_); reg_w(gspca_dev, 0x0960, REG_LINE_LENGTH_PCK_); } else if (w == 1600) { reg_w(gspca_dev, 0x0640, REG_FRAME_LENGTH_LINES_); reg_w(gspca_dev, 0x0FA0, REG_LINE_LENGTH_PCK_); } else if (w == 3264) { reg_w(gspca_dev, 0x0B4B, REG_FRAME_LENGTH_LINES_); reg_w(gspca_dev, 0x1F40, REG_LINE_LENGTH_PCK_); } else { gspca_err(gspca_dev, "bad width %u\n", w); gspca_dev->usb_err = -EINVAL; return; } } /* Packets that were encrypted, no idea if the grouping is significant */ static void configure_encrypted(struct gspca_dev *gspca_dev) { static const struct cmd reg_init_begin[] = { {0x0100, REG_SOFTWARE_RESET}, {0x0000, REG_MODE_SELECT}, {0x0100, REG_GROUPED_PARAMETER_HOLD}, {0x0004, REG_VT_PIX_CLK_DIV}, {0x0001, REG_VT_SYS_CLK_DIV}, {0x0008, REG_OP_PIX_CLK_DIV}, {0x0001, REG_OP_SYS_CLK_DIV}, {0x0004, REG_PRE_PLL_CLK_DIV}, {0x0040, REG_PLL_MULTIPLIER}, {0x0000, REG_GROUPED_PARAMETER_HOLD}, {0x0100, REG_GROUPED_PARAMETER_HOLD}, }; static const struct cmd reg_init_end[] = { {0x0000, REG_GROUPED_PARAMETER_HOLD}, {0x0301, 0x31AE}, {0x0805, 0x3064}, {0x0071, 0x3170}, {0x10DE, REG_RESET_REGISTER}, {0x0000, REG_MODE_SELECT}, {0x0010, REG_PLL_MULTIPLIER}, {0x0100, REG_MODE_SELECT}, }; gspca_dbg(gspca_dev, D_STREAM, "Encrypted begin, w = %u\n\n", gspca_dev->pixfmt.width); reg_w_buf(gspca_dev, reg_init_begin, ARRAY_SIZE(reg_init_begin)); 
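	/*
	 * With the clock divider / PLL block above written, program the sensor
	 * window, scaler and frame/line timing for the selected resolution,
	 * then finish with the remaining (formerly encrypted) register writes.
	 */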
configure_wh(gspca_dev); reg_w_buf(gspca_dev, reg_init_end, ARRAY_SIZE(reg_init_end)); reg_w(gspca_dev, 0x0100, REG_GROUPED_PARAMETER_HOLD); reg_w(gspca_dev, 0x0000, REG_GROUPED_PARAMETER_HOLD); gspca_dbg(gspca_dev, D_STREAM, "Encrypted end\n\n"); } static int configure(struct gspca_dev *gspca_dev) { int rc; char *buff = gspca_dev->usb_buf; gspca_dbg(gspca_dev, D_STREAM, "configure()\n\n"); /* * First driver sets a sort of encryption key * A number of futur requests of this type have wValue and wIndex * encrypted as follows: * -Compute key = this wValue rotate left by 4 bits * (decrypt.py rotates right because we are decrypting) * -Later packets encrypt packets by XOR'ing with key * XOR encrypt/decrypt is symmetrical * wValue, and wIndex are encrypted * bRequest is not and bRequestType is always 0xC0 * This allows resyncing if key is unknown? * By setting 0 we XOR with 0 and the shifting and XOR drops out */ rc = usb_control_msg(gspca_dev->dev, usb_rcvctrlpipe(gspca_dev->dev, 0), 0x16, 0xC0, 0x0000, 0x0000, buff, 2, 500); if (val_reply(gspca_dev, buff, rc)) { gspca_err(gspca_dev, "failed key req\n"); return -EIO; } /* * Next does some sort of 2 packet challenge / response * evidence suggests its an Atmel I2C crypto part but nobody cares to * look * (to make sure its not cloned hardware?) * Ignore: I want to work with their hardware, not clone it * 16 bytes out challenge, requestType: 0x40 * 16 bytes in response, requestType: 0xC0 */ rc = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x01, 0x40, 0x0001, 0x000F, NULL, 0, 500); if (rc < 0) { gspca_err(gspca_dev, "failed to replay packet 176 w/ rc %d\n", rc); return rc; } rc = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x01, 0x40, 0x0000, 0x000F, NULL, 0, 500); if (rc < 0) { gspca_err(gspca_dev, "failed to replay packet 178 w/ rc %d\n", rc); return rc; } rc = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x01, 0x40, 0x0001, 0x000F, NULL, 0, 500); if (rc < 0) { gspca_err(gspca_dev, "failed to replay packet 180 w/ rc %d\n", rc); return rc; } /* * Serial number? Doesn't seem to be required * cam1: \xE6\x0D\x00\x00, cam2: \x70\x19\x00\x00 * rc = usb_control_msg(gspca_dev->dev, * usb_rcvctrlpipe(gspca_dev->dev, 0), * 0x20, 0xC0, 0x0000, 0x0000, buff, 4, 500); */ /* Large (EEPROM?) read, skip it since no idea what to do with it */ gspca_dev->usb_err = 0; configure_encrypted(gspca_dev); if (gspca_dev->usb_err) return gspca_dev->usb_err; /* Omitted this by accident, does not work without it */ rc = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0x01, 0x40, 0x0003, 0x000F, NULL, 0, 500); if (rc < 0) { gspca_err(gspca_dev, "failed to replay final packet w/ rc %d\n", rc); return rc; } gspca_dbg(gspca_dev, D_STREAM, "Configure complete\n\n"); return 0; } static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { gspca_dev->cam.cam_mode = vga_mode; gspca_dev->cam.nmodes = ARRAY_SIZE(vga_mode); /* Yes we want URBs and we want them now! 
*/ gspca_dev->cam.no_urb_create = 0; gspca_dev->cam.bulk_nurbs = 4; /* Largest size the windows driver uses */ gspca_dev->cam.bulk_size = BULK_SIZE; /* Def need to use bulk transfers */ gspca_dev->cam.bulk = 1; return 0; } static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; int rc; sd->this_f = 0; rc = configure(gspca_dev); if (rc < 0) { gspca_err(gspca_dev, "Failed configure\n"); return rc; } /* First two frames have messed up gains Drop them to avoid special cases in user apps? */ return 0; } static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { struct sd *sd = (struct sd *) gspca_dev; if (len != BULK_SIZE) { /* can we finish a frame? */ if (sd->this_f + len == gspca_dev->pixfmt.sizeimage) { gspca_frame_add(gspca_dev, LAST_PACKET, data, len); gspca_dbg(gspca_dev, D_FRAM, "finish frame sz %u/%u w/ len %u\n\n", sd->this_f, gspca_dev->pixfmt.sizeimage, len); /* lost some data, discard the frame */ } else { gspca_frame_add(gspca_dev, DISCARD_PACKET, NULL, 0); gspca_dbg(gspca_dev, D_FRAM, "abort frame sz %u/%u w/ len %u\n\n", sd->this_f, gspca_dev->pixfmt.sizeimage, len); } sd->this_f = 0; } else { if (sd->this_f == 0) gspca_frame_add(gspca_dev, FIRST_PACKET, data, len); else gspca_frame_add(gspca_dev, INTER_PACKET, data, len); sd->this_f += len; } } static int sd_init(struct gspca_dev *gspca_dev) { return 0; } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *) gspca_dev; gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_EXPOSURE: setexposure(gspca_dev, ctrl->val); break; case V4L2_CID_GAIN: /* gspca_dev->gain automatically updated */ setggain(gspca_dev, gspca_dev->gain->val); break; case V4L2_CID_BLUE_BALANCE: sd->blue->val = ctrl->val; setbgain(gspca_dev, sd->blue->val, gspca_dev->gain->val); break; case V4L2_CID_RED_BALANCE: sd->red->val = ctrl->val; setrgain(gspca_dev, sd->red->val, gspca_dev->gain->val); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 4); gspca_dev->exposure = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, /* Mostly limited by URB timeouts */ /* XXX: make dynamic based on frame rate? 
*/ V4L2_CID_EXPOSURE, 0, 800, 1, 350); gspca_dev->gain = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_GAIN, 0, 511, 1, 128); sd->blue = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BLUE_BALANCE, 0, 1023, 1, 80); sd->red = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_RED_BALANCE, 0, 1023, 1, 295); if (hdl->error) { gspca_err(gspca_dev, "Could not initialize controls\n"); return hdl->error; } return 0; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .pkt_scan = sd_pkt_scan, }; /* Table of supported USB devices */ static const struct usb_device_id device_table[] = { /* Commented out devices should be related */ /* AS: AmScope, TT: ToupTek */ /* { USB_DEVICE(0x0547, 0x6035) }, TT UCMOS00350KPA */ /* { USB_DEVICE(0x0547, 0x6130) }, TT UCMOS01300KPA */ /* { USB_DEVICE(0x0547, 0x6200) }, TT UCMOS02000KPA */ /* { USB_DEVICE(0x0547, 0x6310) }, TT UCMOS03100KPA */ /* { USB_DEVICE(0x0547, 0x6510) }, TT UCMOS05100KPA */ /* { USB_DEVICE(0x0547, 0x6800) }, TT UCMOS08000KPA */ /* { USB_DEVICE(0x0547, 0x6801) }, TT UCMOS08000KPB */ { USB_DEVICE(0x0547, 0x6801) }, /* TT UCMOS08000KPB, AS MU800 */ /* { USB_DEVICE(0x0547, 0x6900) }, TT UCMOS09000KPA */ /* { USB_DEVICE(0x0547, 0x6901) }, TT UCMOS09000KPB */ /* { USB_DEVICE(0x0547, 0x6010) }, TT UCMOS10000KPA */ /* { USB_DEVICE(0x0547, 0x6014) }, TT UCMOS14000KPA */ /* { USB_DEVICE(0x0547, 0x6131) }, TT UCMOS01300KMA */ /* { USB_DEVICE(0x0547, 0x6511) }, TT UCMOS05100KMA */ /* { USB_DEVICE(0x0547, 0x8080) }, TT UHCCD00800KPA */ /* { USB_DEVICE(0x0547, 0x8140) }, TT UHCCD01400KPA */ /* { USB_DEVICE(0x0547, 0x8141) }, TT EXCCD01400KPA */ /* { USB_DEVICE(0x0547, 0x8200) }, TT UHCCD02000KPA */ /* { USB_DEVICE(0x0547, 0x8201) }, TT UHCCD02000KPB */ /* { USB_DEVICE(0x0547, 0x8310) }, TT UHCCD03100KPA */ /* { USB_DEVICE(0x0547, 0x8500) }, TT UHCCD05000KPA */ /* { USB_DEVICE(0x0547, 0x8510) }, TT UHCCD05100KPA */ /* { USB_DEVICE(0x0547, 0x8600) }, TT UHCCD06000KPA */ /* { USB_DEVICE(0x0547, 0x8800) }, TT UHCCD08000KPA */ /* { USB_DEVICE(0x0547, 0x8315) }, TT UHCCD03150KPA */ /* { USB_DEVICE(0x0547, 0x7800) }, TT UHCCD00800KMA */ /* { USB_DEVICE(0x0547, 0x7140) }, TT UHCCD01400KMA */ /* { USB_DEVICE(0x0547, 0x7141) }, TT UHCCD01400KMB */ /* { USB_DEVICE(0x0547, 0x7200) }, TT UHCCD02000KMA */ /* { USB_DEVICE(0x0547, 0x7315) }, TT UHCCD03150KMA */ { } }; MODULE_DEVICE_TABLE(usb, device_table); static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, #endif }; static int __init sd_mod_init(void) { int ret; ret = usb_register(&sd_driver); if (ret < 0) return ret; return 0; } static void __exit sd_mod_exit(void) { usb_deregister(&sd_driver); } module_init(sd_mod_init); module_exit(sd_mod_exit); |
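gainify() above folds a linear 0-511 gain request into the sensor's three gain stages: up to 0x7F only the low bits are used, up to 0xFF the medium (x2) stage bit is set and the low bits halved, and anything larger sets the high (x4) stage bit and quarters them. The standalone sketch below reproduces that mapping with a few worked values; demo_gainify() and the printout are illustrative only, not part of the driver.

/*
 * Illustrative sketch of the touptek gain register encoding; mirrors the
 * driver's gainify() but is not part of it. demo_gainify() is a made-up name.
 */
#include <stdio.h>

static int demo_gainify(int in)
{
	if (in <= 0x7F)			/* low-gain bits only */
		return 0x1000 | in;
	else if (in <= 0xFF)		/* medium (x2) stage + halved low bits */
		return 0x1080 | in / 2;
	else				/* high (x4) stage + quartered low bits */
		return 0x1180 | in / 4;
}

int main(void)
{
	/* 100 -> 0x1064, 200 -> 0x10e4, 300 -> 0x11cb */
	printf("%#x %#x %#x\n",
	       demo_gainify(100), demo_gainify(200), demo_gainify(300));
	return 0;
}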
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2002 International Business Machines, Corp.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * This abstraction represents an SCTP endpoint.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@austin.ibm.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Dajiang Zhang <dajiang.zhang@nokia.com>
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/random.h>	/* get_random_bytes() */
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations for internal helpers. */
static void sctp_endpoint_bh_rcv(struct work_struct *work);

static void gen_cookie_auth_key(struct hmac_sha256_key *key)
{
	u8 raw_key[SCTP_COOKIE_KEY_SIZE];

	get_random_bytes(raw_key, sizeof(raw_key));
	hmac_sha256_preparekey(key, raw_key, sizeof(raw_key));
	memzero_explicit(raw_key, sizeof(raw_key));
}

/*
 * Initialize the base fields of the endpoint structure.
*/ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct sock *sk, gfp_t gfp) { struct net *net = sock_net(sk); struct sctp_shared_key *null_key; ep->asconf_enable = net->sctp.addip_enable; ep->auth_enable = net->sctp.auth_enable; if (ep->auth_enable) { if (sctp_auth_init(ep, gfp)) goto nomem; if (ep->asconf_enable) { sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); } } /* Initialize the base structure. */ /* What type of endpoint are we? */ ep->base.type = SCTP_EP_TYPE_SOCKET; /* Initialize the basic object fields. */ refcount_set(&ep->base.refcnt, 1); ep->base.dead = false; /* Create an input queue. */ sctp_inq_init(&ep->base.inqueue); /* Set its top-half handler */ sctp_inq_set_th_handler(&ep->base.inqueue, sctp_endpoint_bh_rcv); /* Initialize the bind addr area */ sctp_bind_addr_init(&ep->base.bind_addr, 0); /* Create the lists of associations. */ INIT_LIST_HEAD(&ep->asocs); /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = net->sctp.sndbuf_policy; sk->sk_data_ready = sctp_data_ready; sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); /* Get the receive buffer policy for this endpoint */ ep->rcvbuf_policy = net->sctp.rcvbuf_policy; /* Generate the cookie authentication key. */ gen_cookie_auth_key(&ep->cookie_auth_key); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); null_key = sctp_auth_shkey_create(0, gfp); if (!null_key) goto nomem_shkey; list_add(&null_key->key_list, &ep->endpoint_shared_keys); /* Add the null key to the endpoint shared keys list and * set the hmcas and chunks pointers. */ ep->prsctp_enable = net->sctp.prsctp_enable; ep->reconf_enable = net->sctp.reconf_enable; ep->ecn_enable = net->sctp.ecn_enable; /* Remember who we are attached to. */ ep->base.sk = sk; ep->base.net = sock_net(sk); sock_hold(ep->base.sk); return ep; nomem_shkey: sctp_auth_free(ep); nomem: return NULL; } /* Create a sctp_endpoint with all that boring stuff initialized. * Returns NULL if there isn't enough memory. */ struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp) { struct sctp_endpoint *ep; /* Build a local endpoint. */ ep = kzalloc(sizeof(*ep), gfp); if (!ep) goto fail; if (!sctp_endpoint_init(ep, sk, gfp)) goto fail_init; SCTP_DBG_OBJCNT_INC(ep); return ep; fail_init: kfree(ep); fail: return NULL; } /* Add an association to an endpoint. */ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, struct sctp_association *asoc) { struct sock *sk = ep->base.sk; /* If this is a temporary association, don't bother * since we'll be removing it shortly and don't * want anyone to find it anyway. */ if (asoc->temp) return; /* Now just add it to our list of asocs */ list_add_tail(&asoc->asocs, &ep->asocs); /* Increment the backlog value for a TCP-style listening socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) sk_acceptq_added(sk); } /* Free the endpoint structure. Delay cleanup until * all users have released their reference count on this structure. */ void sctp_endpoint_free(struct sctp_endpoint *ep) { ep->base.dead = true; inet_sk_set_state(ep->base.sk, SCTP_SS_CLOSED); /* Unlink this endpoint, so we can't find it again! */ sctp_unhash_endpoint(ep); sctp_endpoint_put(ep); } /* Final destructor for endpoint. 
*/ static void sctp_endpoint_destroy_rcu(struct rcu_head *head) { struct sctp_endpoint *ep = container_of(head, struct sctp_endpoint, rcu); struct sock *sk = ep->base.sk; sctp_sk(sk)->ep = NULL; sock_put(sk); kfree(ep); SCTP_DBG_OBJCNT_DEC(ep); } static void sctp_endpoint_destroy(struct sctp_endpoint *ep) { struct sock *sk; if (unlikely(!ep->base.dead)) { WARN(1, "Attempt to destroy undead endpoint %p!\n", ep); return; } /* SCTP-AUTH: Free up AUTH releated data such as shared keys * chunks and hmacs arrays that were allocated */ sctp_auth_destroy_keys(&ep->endpoint_shared_keys); sctp_auth_free(ep); /* Cleanup. */ sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); memzero_explicit(&ep->cookie_auth_key, sizeof(ep->cookie_auth_key)); sk = ep->base.sk; /* Remove and free the port */ if (sctp_sk(sk)->bind_hash) sctp_put_port(sk); call_rcu(&ep->rcu, sctp_endpoint_destroy_rcu); } /* Hold a reference to an endpoint. */ int sctp_endpoint_hold(struct sctp_endpoint *ep) { return refcount_inc_not_zero(&ep->base.refcnt); } /* Release a reference to an endpoint and clean up if there are * no more references. */ void sctp_endpoint_put(struct sctp_endpoint *ep) { if (refcount_dec_and_test(&ep->base.refcnt)) sctp_endpoint_destroy(ep); } /* Is this the endpoint we are looking for? */ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct net *net, const union sctp_addr *laddr, int dif, int sdif) { int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if); struct sctp_endpoint *retval = NULL; if (net_eq(ep->base.net, net) && sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && (htons(ep->base.bind_addr.port) == laddr->v4.sin_port)) { if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, sctp_sk(ep->base.sk))) retval = ep; } return retval; } /* Find the association that goes with this chunk. * We lookup the transport from hashtable at first, then get association * through t->assoc. */ struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, const union sctp_addr *paddr, struct sctp_transport **transport) { struct sctp_association *asoc = NULL; struct sctp_transport *t; *transport = NULL; /* If the local port is not set, there can't be any associations * on this endpoint. */ if (!ep->base.bind_addr.port) return NULL; rcu_read_lock(); t = sctp_epaddr_lookup_transport(ep, paddr); if (!t) goto out; *transport = t; asoc = t->asoc; out: rcu_read_unlock(); return asoc; } /* Look for any peeled off association from the endpoint that matches the * given peer address. */ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr) { int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if); struct sctp_sockaddr_entry *addr; struct net *net = ep->base.net; struct sctp_bind_addr *bp; bp = &ep->base.bind_addr; /* This function is called with the socket lock held, * so the address_list can not change. */ list_for_each_entry(addr, &bp->address_list, list) { if (sctp_has_association(net, &addr->a, paddr, bound_dev_if, bound_dev_if)) return true; } return false; } /* Do delayed input processing. This is scheduled by sctp_rcv(). * This may be called on BH or task time. 
*/ static void sctp_endpoint_bh_rcv(struct work_struct *work) { struct sctp_endpoint *ep = container_of(work, struct sctp_endpoint, base.inqueue.immediate); struct sctp_association *asoc; struct sock *sk; struct net *net; struct sctp_transport *transport; struct sctp_chunk *chunk; struct sctp_inq *inqueue; union sctp_subtype subtype; enum sctp_state state; int error = 0; int first_time = 1; /* is this the first time through the loop */ if (ep->base.dead) return; asoc = NULL; inqueue = &ep->base.inqueue; sk = ep->base.sk; net = sock_net(sk); while (NULL != (chunk = sctp_inq_pop(inqueue))) { subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type); /* If the first chunk in the packet is AUTH, do special * processing specified in Section 6.3 of SCTP-AUTH spec */ if (first_time && (subtype.chunk == SCTP_CID_AUTH)) { struct sctp_chunkhdr *next_hdr; next_hdr = sctp_inq_peek(inqueue); if (!next_hdr) goto normal; /* If the next chunk is COOKIE-ECHO, skip the AUTH * chunk while saving a pointer to it so we can do * Authentication later (during cookie-echo * processing). */ if (next_hdr->type == SCTP_CID_COOKIE_ECHO) { chunk->auth_chunk = skb_clone(chunk->skb, GFP_ATOMIC); chunk->auth = 1; continue; } } normal: /* We might have grown an association since last we * looked, so try again. * * This happens when we've just processed our * COOKIE-ECHO chunk. */ if (NULL == chunk->asoc) { asoc = sctp_endpoint_lookup_assoc(ep, sctp_source(chunk), &transport); chunk->asoc = asoc; chunk->transport = transport; } state = asoc ? asoc->state : SCTP_STATE_CLOSED; if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth) continue; /* Remember where the last DATA chunk came from so we * know where to send the SACK. */ if (asoc && sctp_chunk_is_data(chunk)) asoc->peer.last_data_from = chunk->transport; else { SCTP_INC_STATS(ep->base.net, SCTP_MIB_INCTRLCHUNKS); if (asoc) asoc->stats.ictrlchunks++; } if (chunk->transport) chunk->transport->last_time_heard = ktime_get(); error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype, state, ep, asoc, chunk, GFP_ATOMIC); if (error && chunk) chunk->pdiscard = 1; /* Check to see if the endpoint is freed in response to * the incoming chunk. If so, get out of the while loop. */ if (!sctp_sk(sk)->ep) break; if (first_time) first_time = 0; } } |
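sctp_endpoint_hold() and sctp_endpoint_put() above implement the usual kernel lifetime rule: taking a reference only succeeds while the count is non-zero, and whoever drops the last reference runs the destructor (here via call_rcu()). The userspace analogue below sketches that rule with C11 atomics; the obj_* names are invented for illustration, and the kernel's refcount_t additionally saturates and warns on overflow/underflow, which is omitted here.

/*
 * Simplified userspace analogue of the hold/put lifetime rules used by
 * sctp_endpoint_hold()/sctp_endpoint_put(); obj_hold()/obj_put() are made-up
 * names, not kernel APIs.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcnt;	/* starts at 1, owned by the creator */
};

static void obj_destroy(struct obj *o)
{
	/* final cleanup would go here */
}

/* Take a reference only if the object is still alive (refcnt != 0). */
static bool obj_hold(struct obj *o)
{
	int cur = atomic_load(&o->refcnt);

	while (cur != 0)
		if (atomic_compare_exchange_weak(&o->refcnt, &cur, cur + 1))
			return true;
	return false;
}

/* Drop a reference; the caller that drops the last one destroys the object. */
static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		obj_destroy(o);
}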
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/drivers/char/misc.c
 *
 * Generic misc open routine by Johan Myreen
 *
 * Based on code from Linus
 *
 * Teemu Rantanen's Microsoft Busmouse support and Derrick Cole's
 *   changes incorporated into 0.97pl4
 *   by Peter Cervasio (pete%q106fm.uucp@wupost.wustl.edu) (08SEP92)
 *   See busmouse.c for particulars.
 *
 * Made things a lot more modular - easy to compile in just one or two
 * of the misc drivers, as they are now completely independent. Linus.
 *
 * Support for loadable modules. 8-Sep-95 Philip Blundell <pjb27@cam.ac.uk>
 *
 * Fixed a failing symbol register to free the device registration
 * Alan Cox <alan@lxorguk.ukuu.org.uk> 21-Jan-96
 *
 * Dynamic minors and /proc/mice by Alessandro Rubini. 26-Mar-96
 *
 * Renamed to misc and miscdevice to be more accurate. Alan Cox 26-Mar-96
 *
 * Handling of mouse minor numbers for kerneld:
 *  Idea by Jacques Gelinas <jack@solucorp.qc.ca>,
 *  adapted by Bjorn Ekwall <bj0rn@blox.se>
 *  corrected by Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Changes for kmod (from kerneld):
 *  Cyrus Durgin <cider@speakeasy.org>
 *
 * Added devfs support. Richard Gooch <rgooch@atnf.csiro.au> 10-Jan-1998
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/miscdevice.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/tty.h>
#include <linux/kmod.h>
#include <linux/gfp.h>

/*
 * Head entry for the doubly linked miscdevice list
 */
static LIST_HEAD(misc_list);
static DEFINE_MUTEX(misc_mtx);

/*
 * Assigned numbers.
*/ static DEFINE_IDA(misc_minors_ida); static int misc_minor_alloc(int minor) { int ret = 0; if (minor == MISC_DYNAMIC_MINOR) { /* allocate free id */ ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1, MINORMASK, GFP_KERNEL); } else { ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL); } return ret; } static void misc_minor_free(int minor) { ida_free(&misc_minors_ida, minor); } #ifdef CONFIG_PROC_FS static void *misc_seq_start(struct seq_file *seq, loff_t *pos) { mutex_lock(&misc_mtx); return seq_list_start(&misc_list, *pos); } static void *misc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &misc_list, pos); } static void misc_seq_stop(struct seq_file *seq, void *v) { mutex_unlock(&misc_mtx); } static int misc_seq_show(struct seq_file *seq, void *v) { const struct miscdevice *p = list_entry(v, struct miscdevice, list); seq_printf(seq, "%3i %s\n", p->minor, p->name ? p->name : ""); return 0; } static const struct seq_operations misc_seq_ops = { .start = misc_seq_start, .next = misc_seq_next, .stop = misc_seq_stop, .show = misc_seq_show, }; #endif static int misc_open(struct inode *inode, struct file *file) { int minor = iminor(inode); struct miscdevice *c = NULL, *iter; int err = -ENODEV; const struct file_operations *new_fops = NULL; mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } /* Only request module for fixed minor code */ if (!new_fops && minor < MISC_DYNAMIC_MINOR) { mutex_unlock(&misc_mtx); request_module("char-major-%d-%d", MISC_MAJOR, minor); mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } } if (!new_fops) goto fail; /* * Place the miscdevice in the file's * private_data so it can be used by the * file operations, including f_op->open below */ file->private_data = c; err = 0; replace_fops(file, new_fops); if (file->f_op->open) err = file->f_op->open(inode, file); fail: mutex_unlock(&misc_mtx); return err; } static char *misc_devnode(const struct device *dev, umode_t *mode) { const struct miscdevice *c = dev_get_drvdata(dev); if (mode && c->mode) *mode = c->mode; if (c->nodename) return kstrdup(c->nodename, GFP_KERNEL); return NULL; } static const struct class misc_class = { .name = "misc", .devnode = misc_devnode, }; static const struct file_operations misc_fops = { .owner = THIS_MODULE, .open = misc_open, .llseek = noop_llseek, }; /** * misc_register - register a miscellaneous device * @misc: device structure * * Register a miscellaneous device with the kernel. If the minor * number is set to %MISC_DYNAMIC_MINOR a minor number is assigned * and placed in the minor field of the structure. For other cases * the minor number requested is used. * * The structure passed is linked into the kernel and may not be * destroyed until it has been unregistered. By default, an open() * syscall to the device sets file->private_data to point to the * structure. Drivers don't need open in fops for this. * * A zero is returned on success and a negative errno code for * failure. 
*/ int misc_register(struct miscdevice *misc) { dev_t dev; int err = 0; bool is_dynamic = (misc->minor == MISC_DYNAMIC_MINOR); if (misc->minor > MISC_DYNAMIC_MINOR) { pr_err("Invalid fixed minor %d for miscdevice '%s'\n", misc->minor, misc->name); return -EINVAL; } INIT_LIST_HEAD(&misc->list); mutex_lock(&misc_mtx); if (is_dynamic) { int i = misc_minor_alloc(misc->minor); if (i < 0) { err = -EBUSY; goto out; } misc->minor = i; } else { struct miscdevice *c; int i; list_for_each_entry(c, &misc_list, list) { if (c->minor == misc->minor) { err = -EBUSY; goto out; } } i = misc_minor_alloc(misc->minor); if (i < 0) { err = -EBUSY; goto out; } } dev = MKDEV(MISC_MAJOR, misc->minor); misc->this_device = device_create_with_groups(&misc_class, misc->parent, dev, misc, misc->groups, "%s", misc->name); if (IS_ERR(misc->this_device)) { misc_minor_free(misc->minor); if (is_dynamic) { misc->minor = MISC_DYNAMIC_MINOR; } err = PTR_ERR(misc->this_device); goto out; } /* * Add it to the front, so that later devices can "override" * earlier defaults */ list_add(&misc->list, &misc_list); out: mutex_unlock(&misc_mtx); return err; } EXPORT_SYMBOL(misc_register); /** * misc_deregister - unregister a miscellaneous device * @misc: device to unregister * * Unregister a miscellaneous device that was previously * successfully registered with misc_register(). */ void misc_deregister(struct miscdevice *misc) { mutex_lock(&misc_mtx); list_del_init(&misc->list); device_destroy(&misc_class, MKDEV(MISC_MAJOR, misc->minor)); misc_minor_free(misc->minor); if (misc->minor > MISC_DYNAMIC_MINOR) misc->minor = MISC_DYNAMIC_MINOR; mutex_unlock(&misc_mtx); } EXPORT_SYMBOL(misc_deregister); static int __init misc_init(void) { int err; struct proc_dir_entry *misc_proc_file; misc_proc_file = proc_create_seq("misc", 0, NULL, &misc_seq_ops); err = class_register(&misc_class); if (err) goto fail_remove; err = __register_chrdev(MISC_MAJOR, 0, MINORMASK + 1, "misc", &misc_fops); if (err < 0) goto fail_printk; return 0; fail_printk: pr_err("unable to get major %d for misc devices\n", MISC_MAJOR); class_unregister(&misc_class); fail_remove: if (misc_proc_file) remove_proc_entry("misc", NULL); return err; } subsys_initcall(misc_init); |
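/*
 * Usage sketch (illustrative only, not part of this file): a minimal driver
 * registering a dynamic-minor misc device as described in the misc_register()
 * kernel-doc above. The names "example_misc" and "example_fops" are
 * hypothetical.
 */
#if 0	/* example only, not compiled here */
static const struct file_operations example_fops = {
	.owner	= THIS_MODULE,
	.llseek	= noop_llseek,
};

static struct miscdevice example_misc = {
	.minor	= MISC_DYNAMIC_MINOR,	/* ask for a free minor */
	.name	= "example_misc",	/* appears as /dev/example_misc */
	.fops	= &example_fops,
};

static int __init example_init(void)
{
	/* on success the assigned minor is written back into example_misc.minor */
	return misc_register(&example_misc);
}

static void __exit example_exit(void)
{
	misc_deregister(&example_misc);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
#endif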
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux I2C core SMBus and SMBus emulation code
 *
 * This file contains the SMBus functions which are always included in the I2C
 * core because they can be emulated via I2C. SMBus specific extensions
 * (e.g. smbalert) are handled in a separate i2c-smbus module.
* * All SMBus-related things are written by Frodo Looijaard <frodol@dds.nl> * SMBus 2.0 support by Mark Studebaker <mdsxyz123@yahoo.com> and * Jean Delvare <jdelvare@suse.de> */ #include <linux/device.h> #include <linux/err.h> #include <linux/i2c.h> #include <linux/i2c-smbus.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/string_choices.h> #include "i2c-core.h" #define CREATE_TRACE_POINTS #include <trace/events/smbus.h> /* The SMBus parts */ #define POLY (0x1070U << 3) static u8 crc8(u16 data) { int i; for (i = 0; i < 8; i++) { if (data & 0x8000) data = data ^ POLY; data = data << 1; } return (u8)(data >> 8); } /** * i2c_smbus_pec - Incremental CRC8 over the given input data array * @crc: previous return crc8 value * @p: pointer to data buffer. * @count: number of bytes in data buffer. * * Incremental CRC8 over count bytes in the array pointed to by p */ u8 i2c_smbus_pec(u8 crc, u8 *p, size_t count) { int i; for (i = 0; i < count; i++) crc = crc8((crc ^ p[i]) << 8); return crc; } EXPORT_SYMBOL(i2c_smbus_pec); /* Assume a 7-bit address, which is reasonable for SMBus */ static u8 i2c_smbus_msg_pec(u8 pec, struct i2c_msg *msg) { /* The address will be sent first */ u8 addr = i2c_8bit_addr_from_msg(msg); pec = i2c_smbus_pec(pec, &addr, 1); /* The data buffer follows */ return i2c_smbus_pec(pec, msg->buf, msg->len); } /* Used for write only transactions */ static inline void i2c_smbus_add_pec(struct i2c_msg *msg) { msg->buf[msg->len] = i2c_smbus_msg_pec(0, msg); msg->len++; } /* Return <0 on CRC error If there was a write before this read (most cases) we need to take the partial CRC from the write part into account. Note that this function does modify the message (we need to decrease the message length to hide the CRC byte from the caller). */ static int i2c_smbus_check_pec(u8 cpec, struct i2c_msg *msg) { u8 rpec = msg->buf[--msg->len]; cpec = i2c_smbus_msg_pec(cpec, msg); if (rpec != cpec) { pr_debug("Bad PEC 0x%02x vs. 0x%02x\n", rpec, cpec); return -EBADMSG; } return 0; } /** * i2c_smbus_read_byte - SMBus "receive byte" protocol * @client: Handle to slave device * * This executes the SMBus "receive byte" protocol, returning negative errno * else the byte received from the device. */ s32 i2c_smbus_read_byte(const struct i2c_client *client) { union i2c_smbus_data data; int status; status = i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE, &data); return (status < 0) ? status : data.byte; } EXPORT_SYMBOL(i2c_smbus_read_byte); /** * i2c_smbus_write_byte - SMBus "send byte" protocol * @client: Handle to slave device * @value: Byte to be sent * * This executes the SMBus "send byte" protocol, returning negative errno * else zero on success. */ s32 i2c_smbus_write_byte(const struct i2c_client *client, u8 value) { return i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_WRITE, value, I2C_SMBUS_BYTE, NULL); } EXPORT_SYMBOL(i2c_smbus_write_byte); /** * i2c_smbus_read_byte_data - SMBus "read byte" protocol * @client: Handle to slave device * @command: Byte interpreted by slave * * This executes the SMBus "read byte" protocol, returning negative errno * else a data byte received from the device. */ s32 i2c_smbus_read_byte_data(const struct i2c_client *client, u8 command) { union i2c_smbus_data data; int status; status = i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_READ, command, I2C_SMBUS_BYTE_DATA, &data); return (status < 0) ? 
status : data.byte; } EXPORT_SYMBOL(i2c_smbus_read_byte_data); /** * i2c_smbus_write_byte_data - SMBus "write byte" protocol * @client: Handle to slave device * @command: Byte interpreted by slave * @value: Byte being written * * This executes the SMBus "write byte" protocol, returning negative errno * else zero on success. */ s32 i2c_smbus_write_byte_data(const struct i2c_client *client, u8 command, u8 value) { union i2c_smbus_data data; data.byte = value; return i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_WRITE, command, I2C_SMBUS_BYTE_DATA, &data); } EXPORT_SYMBOL(i2c_smbus_write_byte_data); /** * i2c_smbus_read_word_data - SMBus "read word" protocol * @client: Handle to slave device * @command: Byte interpreted by slave * * This executes the SMBus "read word" protocol, returning negative errno * else a 16-bit unsigned "word" received from the device. */ s32 i2c_smbus_read_word_data(const struct i2c_client *client, u8 command) { union i2c_smbus_data data; int status; status = i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_READ, command, I2C_SMBUS_WORD_DATA, &data); return (status < 0) ? status : data.word; } EXPORT_SYMBOL(i2c_smbus_read_word_data); /** * i2c_smbus_write_word_data - SMBus "write word" protocol * @client: Handle to slave device * @command: Byte interpreted by slave * @value: 16-bit "word" being written * * This executes the SMBus "write word" protocol, returning negative errno * else zero on success. */ s32 i2c_smbus_write_word_data(const struct i2c_client *client, u8 command, u16 value) { union i2c_smbus_data data; data.word = value; return i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_WRITE, command, I2C_SMBUS_WORD_DATA, &data); } EXPORT_SYMBOL(i2c_smbus_write_word_data); /** * i2c_smbus_read_block_data - SMBus "block read" protocol * @client: Handle to slave device * @command: Byte interpreted by slave * @values: Byte array into which data will be read; big enough to hold * the data returned by the slave. SMBus allows at most 32 bytes. * * This executes the SMBus "block read" protocol, returning negative errno * else the number of data bytes in the slave's response. * * Note that using this function requires that the client's adapter support * the I2C_FUNC_SMBUS_READ_BLOCK_DATA functionality. Not all adapter drivers * support this; its emulation through I2C messaging relies on a specific * mechanism (I2C_M_RECV_LEN) which may not be implemented. */ s32 i2c_smbus_read_block_data(const struct i2c_client *client, u8 command, u8 *values) { union i2c_smbus_data data; int status; status = i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_READ, command, I2C_SMBUS_BLOCK_DATA, &data); if (status) return status; memcpy(values, &data.block[1], data.block[0]); return data.block[0]; } EXPORT_SYMBOL(i2c_smbus_read_block_data); /** * i2c_smbus_write_block_data - SMBus "block write" protocol * @client: Handle to slave device * @command: Byte interpreted by slave * @length: Size of data block; SMBus allows at most 32 bytes * @values: Byte array which will be written. * * This executes the SMBus "block write" protocol, returning negative errno * else zero on success. 
*/ s32 i2c_smbus_write_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values) { union i2c_smbus_data data; if (length > I2C_SMBUS_BLOCK_MAX) length = I2C_SMBUS_BLOCK_MAX; data.block[0] = length; memcpy(&data.block[1], values, length); return i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_WRITE, command, I2C_SMBUS_BLOCK_DATA, &data); } EXPORT_SYMBOL(i2c_smbus_write_block_data); /* Returns the number of read bytes */ s32 i2c_smbus_read_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, u8 *values) { union i2c_smbus_data data; int status; if (length > I2C_SMBUS_BLOCK_MAX) length = I2C_SMBUS_BLOCK_MAX; data.block[0] = length; status = i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_READ, command, I2C_SMBUS_I2C_BLOCK_DATA, &data); if (status < 0) return status; memcpy(values, &data.block[1], data.block[0]); return data.block[0]; } EXPORT_SYMBOL(i2c_smbus_read_i2c_block_data); s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values) { union i2c_smbus_data data; if (length > I2C_SMBUS_BLOCK_MAX) length = I2C_SMBUS_BLOCK_MAX; data.block[0] = length; memcpy(data.block + 1, values, length); return i2c_smbus_xfer(client->adapter, client->addr, client->flags, I2C_SMBUS_WRITE, command, I2C_SMBUS_I2C_BLOCK_DATA, &data); } EXPORT_SYMBOL(i2c_smbus_write_i2c_block_data); static void i2c_smbus_try_get_dmabuf(struct i2c_msg *msg, u8 init_val) { bool is_read = msg->flags & I2C_M_RD; unsigned char *dma_buf; dma_buf = kzalloc(I2C_SMBUS_BLOCK_MAX + (is_read ? 2 : 3), GFP_KERNEL); if (!dma_buf) return; msg->buf = dma_buf; msg->flags |= I2C_M_DMA_SAFE; if (init_val) msg->buf[0] = init_val; } /* * Simulate a SMBus command using the I2C protocol. * No checking of parameters is done! */ static s32 i2c_smbus_xfer_emulated(struct i2c_adapter *adapter, u16 addr, unsigned short flags, char read_write, u8 command, int size, union i2c_smbus_data *data) { /* * So we need to generate a series of msgs. In the case of writing, we * need to use only one message; when reading, we need two. We * initialize most things with sane defaults, to keep the code below * somewhat simpler. */ unsigned char msgbuf0[I2C_SMBUS_BLOCK_MAX+3]; unsigned char msgbuf1[I2C_SMBUS_BLOCK_MAX+2]; int nmsgs = read_write == I2C_SMBUS_READ ? 2 : 1; u8 partial_pec = 0; int status; struct i2c_msg msg[2] = { { .addr = addr, .flags = flags, .len = 1, .buf = msgbuf0, }, { .addr = addr, .flags = flags | I2C_M_RD, .len = 0, .buf = msgbuf1, }, }; bool wants_pec = ((flags & I2C_CLIENT_PEC) && size != I2C_SMBUS_QUICK && size != I2C_SMBUS_I2C_BLOCK_DATA); msgbuf0[0] = command; switch (size) { case I2C_SMBUS_QUICK: msg[0].len = 0; /* Special case: The read/write field is used as data */ msg[0].flags = flags | (read_write == I2C_SMBUS_READ ? I2C_M_RD : 0); nmsgs = 1; break; case I2C_SMBUS_BYTE: if (read_write == I2C_SMBUS_READ) { /* Special case: only a read! 
*/ msg[0].flags = I2C_M_RD | flags; nmsgs = 1; } break; case I2C_SMBUS_BYTE_DATA: if (read_write == I2C_SMBUS_READ) msg[1].len = 1; else { msg[0].len = 2; msgbuf0[1] = data->byte; } break; case I2C_SMBUS_WORD_DATA: if (read_write == I2C_SMBUS_READ) msg[1].len = 2; else { msg[0].len = 3; msgbuf0[1] = data->word & 0xff; msgbuf0[2] = data->word >> 8; } break; case I2C_SMBUS_PROC_CALL: nmsgs = 2; /* Special case */ read_write = I2C_SMBUS_READ; msg[0].len = 3; msg[1].len = 2; msgbuf0[1] = data->word & 0xff; msgbuf0[2] = data->word >> 8; break; case I2C_SMBUS_BLOCK_DATA: if (read_write == I2C_SMBUS_READ) { msg[1].flags |= I2C_M_RECV_LEN; msg[1].len = 1; /* block length will be added by the underlying bus driver */ i2c_smbus_try_get_dmabuf(&msg[1], 0); } else { msg[0].len = data->block[0] + 2; if (msg[0].len > I2C_SMBUS_BLOCK_MAX + 2) { dev_err(&adapter->dev, "Invalid block write size %d\n", data->block[0]); return -EINVAL; } i2c_smbus_try_get_dmabuf(&msg[0], command); memcpy(msg[0].buf + 1, data->block, msg[0].len - 1); } break; case I2C_SMBUS_BLOCK_PROC_CALL: nmsgs = 2; /* Another special case */ read_write = I2C_SMBUS_READ; if (data->block[0] > I2C_SMBUS_BLOCK_MAX) { dev_err(&adapter->dev, "Invalid block write size %d\n", data->block[0]); return -EINVAL; } msg[0].len = data->block[0] + 2; i2c_smbus_try_get_dmabuf(&msg[0], command); memcpy(msg[0].buf + 1, data->block, msg[0].len - 1); msg[1].flags |= I2C_M_RECV_LEN; msg[1].len = 1; /* block length will be added by the underlying bus driver */ i2c_smbus_try_get_dmabuf(&msg[1], 0); break; case I2C_SMBUS_I2C_BLOCK_DATA: if (data->block[0] > I2C_SMBUS_BLOCK_MAX) { dev_err(&adapter->dev, "Invalid block %s size %d\n", str_read_write(read_write == I2C_SMBUS_READ), data->block[0]); return -EINVAL; } if (read_write == I2C_SMBUS_READ) { msg[1].len = data->block[0]; i2c_smbus_try_get_dmabuf(&msg[1], 0); } else { msg[0].len = data->block[0] + 1; i2c_smbus_try_get_dmabuf(&msg[0], command); memcpy(msg[0].buf + 1, data->block + 1, data->block[0]); } break; default: dev_err(&adapter->dev, "Unsupported transaction %d\n", size); return -EOPNOTSUPP; } if (wants_pec) { /* Compute PEC if first message is a write */ if (!(msg[0].flags & I2C_M_RD)) { if (nmsgs == 1) /* Write only */ i2c_smbus_add_pec(&msg[0]); else /* Write followed by read */ partial_pec = i2c_smbus_msg_pec(0, &msg[0]); } /* Ask for PEC if last message is a read */ if (msg[nmsgs - 1].flags & I2C_M_RD) msg[nmsgs - 1].len++; } status = __i2c_transfer(adapter, msg, nmsgs); if (status < 0) goto cleanup; if (status != nmsgs) { status = -EIO; goto cleanup; } status = 0; /* Check PEC if last message is a read */ if (wants_pec && (msg[nmsgs - 1].flags & I2C_M_RD)) { status = i2c_smbus_check_pec(partial_pec, &msg[nmsgs - 1]); if (status < 0) goto cleanup; } if (read_write == I2C_SMBUS_READ) switch (size) { case I2C_SMBUS_BYTE: data->byte = msgbuf0[0]; break; case I2C_SMBUS_BYTE_DATA: data->byte = msgbuf1[0]; break; case I2C_SMBUS_WORD_DATA: case I2C_SMBUS_PROC_CALL: data->word = msgbuf1[0] | (msgbuf1[1] << 8); break; case I2C_SMBUS_I2C_BLOCK_DATA: memcpy(data->block + 1, msg[1].buf, data->block[0]); break; case I2C_SMBUS_BLOCK_DATA: case I2C_SMBUS_BLOCK_PROC_CALL: if (msg[1].buf[0] > I2C_SMBUS_BLOCK_MAX) { dev_err(&adapter->dev, "Invalid block size returned: %d\n", msg[1].buf[0]); status = -EPROTO; goto cleanup; } memcpy(data->block, msg[1].buf, msg[1].buf[0] + 1); break; } cleanup: if (msg[0].flags & I2C_M_DMA_SAFE) kfree(msg[0].buf); if (msg[1].flags & I2C_M_DMA_SAFE) kfree(msg[1].buf); return status; } 
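/*
 * Usage sketch (illustrative only, not part of this file): how a client
 * driver typically calls the SMBus helpers documented earlier in this file
 * (i2c_smbus_read_byte_data(), i2c_smbus_write_byte_data(), ...). The
 * register offsets and the helper name are hypothetical.
 */
#if 0	/* example only, not compiled here */
static int example_touch_reg(struct i2c_client *client)
{
	s32 status;

	/* "read byte" protocol: negative errno or the byte received */
	status = i2c_smbus_read_byte_data(client, 0x00 /* hypothetical reg */);
	if (status < 0)
		return status;

	/* "write byte" protocol: negative errno or zero on success */
	return i2c_smbus_write_byte_data(client, 0x01 /* hypothetical reg */,
					 (u8)status);
}
#endif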
/** * i2c_smbus_xfer - execute SMBus protocol operations * @adapter: Handle to I2C bus * @addr: Address of SMBus slave on that bus * @flags: I2C_CLIENT_* flags (usually zero or I2C_CLIENT_PEC) * @read_write: I2C_SMBUS_READ or I2C_SMBUS_WRITE * @command: Byte interpreted by slave, for protocols which use such bytes * @protocol: SMBus protocol operation to execute, such as I2C_SMBUS_PROC_CALL * @data: Data to be read or written * * This executes an SMBus protocol operation, and returns a negative * errno code else zero on success. */ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags, char read_write, u8 command, int protocol, union i2c_smbus_data *data) { s32 res; res = __i2c_lock_bus_helper(adapter); if (res) return res; res = __i2c_smbus_xfer(adapter, addr, flags, read_write, command, protocol, data); i2c_unlock_bus(adapter, I2C_LOCK_SEGMENT); return res; } EXPORT_SYMBOL(i2c_smbus_xfer); s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags, char read_write, u8 command, int protocol, union i2c_smbus_data *data) { int (*xfer_func)(struct i2c_adapter *adap, u16 addr, unsigned short flags, char read_write, u8 command, int size, union i2c_smbus_data *data); unsigned long orig_jiffies; int try; s32 res; res = __i2c_check_suspended(adapter); if (res) return res; /* If enabled, the following two tracepoints are conditional on * read_write and protocol. */ trace_smbus_write(adapter, addr, flags, read_write, command, protocol, data); trace_smbus_read(adapter, addr, flags, read_write, command, protocol); flags &= I2C_M_TEN | I2C_CLIENT_PEC | I2C_CLIENT_SCCB; xfer_func = adapter->algo->smbus_xfer; if (i2c_in_atomic_xfer_mode()) { if (adapter->algo->smbus_xfer_atomic) xfer_func = adapter->algo->smbus_xfer_atomic; else if (adapter->algo->master_xfer_atomic) xfer_func = NULL; /* fallback to I2C emulation */ } if (xfer_func) { /* Retry automatically on arbitration loss */ orig_jiffies = jiffies; for (res = 0, try = 0; try <= adapter->retries; try++) { res = xfer_func(adapter, addr, flags, read_write, command, protocol, data); if (res != -EAGAIN) break; if (time_after(jiffies, orig_jiffies + adapter->timeout)) break; } if (res != -EOPNOTSUPP || !adapter->algo->master_xfer) goto trace; /* * Fall back to i2c_smbus_xfer_emulated if the adapter doesn't * implement native support for the SMBus operation. */ } res = i2c_smbus_xfer_emulated(adapter, addr, flags, read_write, command, protocol, data); trace: /* If enabled, the reply tracepoint is conditional on read_write. */ trace_smbus_reply(adapter, addr, flags, read_write, command, protocol, data, res); trace_smbus_result(adapter, addr, flags, read_write, command, protocol, res); return res; } EXPORT_SYMBOL(__i2c_smbus_xfer); /** * i2c_smbus_read_i2c_block_data_or_emulated - read block or emulate * @client: Handle to slave device * @command: Byte interpreted by slave * @length: Size of data block; SMBus allows at most I2C_SMBUS_BLOCK_MAX bytes * @values: Byte array into which data will be read; big enough to hold * the data returned by the slave. SMBus allows at most * I2C_SMBUS_BLOCK_MAX bytes. * * This executes the SMBus "block read" protocol if supported by the adapter. * If block read is not supported, it emulates it using either word or byte * read protocols depending on availability. * * The addresses of the I2C slave device that are accessed with this function * must be mapped to a linear region, so that a block read will have the same * effect as a byte read. 
Before using this function you must double-check * if the I2C slave does support exchanging a block transfer with a byte * transfer. */ s32 i2c_smbus_read_i2c_block_data_or_emulated(const struct i2c_client *client, u8 command, u8 length, u8 *values) { u8 i = 0; int status; if (length > I2C_SMBUS_BLOCK_MAX) length = I2C_SMBUS_BLOCK_MAX; if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_I2C_BLOCK)) return i2c_smbus_read_i2c_block_data(client, command, length, values); if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_BYTE_DATA)) return -EOPNOTSUPP; if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_WORD_DATA)) { while ((i + 2) <= length) { status = i2c_smbus_read_word_data(client, command + i); if (status < 0) return status; values[i] = status & 0xff; values[i + 1] = status >> 8; i += 2; } } while (i < length) { status = i2c_smbus_read_byte_data(client, command + i); if (status < 0) return status; values[i] = status; i++; } return i; } EXPORT_SYMBOL(i2c_smbus_read_i2c_block_data_or_emulated); /** * i2c_new_smbus_alert_device - get ara client for SMBus alert support * @adapter: the target adapter * @setup: setup data for the SMBus alert handler * Context: can sleep * * Setup handling of the SMBus alert protocol on a given I2C bus segment. * * Handling can be done either through our IRQ handler, or by the * adapter (from its handler, periodic polling, or whatever). * * This returns the ara client, which should be saved for later use with * i2c_handle_smbus_alert() and ultimately i2c_unregister_device(); or an * ERRPTR to indicate an error. */ struct i2c_client *i2c_new_smbus_alert_device(struct i2c_adapter *adapter, struct i2c_smbus_alert_setup *setup) { struct i2c_board_info ara_board_info = { I2C_BOARD_INFO("smbus_alert", 0x0c), .platform_data = setup, }; return i2c_new_client_device(adapter, &ara_board_info); } EXPORT_SYMBOL_GPL(i2c_new_smbus_alert_device); #if IS_ENABLED(CONFIG_I2C_SMBUS) int i2c_setup_smbus_alert(struct i2c_adapter *adapter) { struct device *parent = adapter->dev.parent; int irq; /* Adapter instantiated without parent, skip the SMBus alert setup */ if (!parent) return 0; /* Report serious errors */ irq = device_property_match_string(parent, "interrupt-names", "smbus_alert"); if (irq < 0 && irq != -EINVAL && irq != -ENODATA) return irq; /* Skip setup when no irq was found */ if (irq < 0 && !device_property_present(parent, "smbalert-gpios")) return 0; return PTR_ERR_OR_ZERO(i2c_new_smbus_alert_device(adapter, NULL)); } #endif |
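/*
 * Usage sketch (illustrative only, not part of this file): reading a linear
 * register window with the emulated block helper documented above. It works
 * whether or not the adapter supports I2C_FUNC_SMBUS_READ_I2C_BLOCK, falling
 * back to word or byte reads. The start register and length are hypothetical.
 */
#if 0	/* example only, not compiled here */
static int example_read_window(const struct i2c_client *client, u8 *buf)
{
	/* returns the number of bytes read, or a negative errno */
	return i2c_smbus_read_i2c_block_data_or_emulated(client,
							 0x10 /* hypothetical start reg */,
							 8, buf);
}
#endif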
| 78 35 16 3 5 114 165 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xdp #if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_XDP_H #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/tracepoint.h> #include <linux/bpf.h> #include <net/xdp.h> #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ FN(DROP) \ FN(PASS) \ FN(TX) \ FN(REDIRECT) #define __XDP_ACT_TP_FN(x) \ TRACE_DEFINE_ENUM(XDP_##x); #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), TP_ARGS(dev, xdp, act), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), TP_fast_assign( __entry->prog_id = xdp->aux->id; __entry->act = act; __entry->ifindex = dev->ifindex; ), TP_printk("prog_id=%d action=%s ifindex=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex) ); TRACE_EVENT(xdp_bulk_tx, TP_PROTO(const struct net_device *dev, int sent, int drops, int err), TP_ARGS(dev, sent, drops, err), TP_STRUCT__entry( __field(int, ifindex) __field(u32, act) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->ifindex = dev->ifindex; __entry->act = XDP_TX; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", __entry->ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); #ifndef __DEVMAP_OBJ_TYPE #define __DEVMAP_OBJ_TYPE struct _bpf_dtab_netdev { struct net_device *dev; }; #endif /* __DEVMAP_OBJ_TYPE */ DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) __field(int, 
to_ifindex) __field(u32, map_id) __field(int, map_index) ), TP_fast_assign( u32 ifindex = 0, map_index = index; if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Just leave to_ifindex to 0 if do broadcast redirect, * as tgt will be NULL. */ if (tgt) ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { ifindex = index; map_index = 0; } __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; __entry->to_ifindex = ifindex; __entry->map_id = map_id; __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_err(dev, xdp, to, err) \ trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) #define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) #ifdef CONFIG_BPF_SYSCALL TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats), TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) __field(unsigned int, xdp_pass) __field(unsigned int, xdp_drop) __field(unsigned int, xdp_redirect) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; __entry->xdp_pass = xdp_stats->pass; __entry->xdp_drop = xdp_stats->drop; __entry->xdp_redirect = xdp_stats->redirect; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " sched=%d" " xdp_pass=%u xdp_drop=%u xdp_redirect=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->sched, __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect) ); TRACE_EVENT(xdp_cpumap_enqueue, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int to_cpu), TP_ARGS(map_id, processed, drops, to_cpu), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, to_cpu) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->to_cpu = 
to_cpu; ), TP_printk("enqueue" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " to_cpu=%d", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->to_cpu) ); TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err), TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( __field(int, from_ifindex) __field(u32, act) __field(int, to_ifindex) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ndo_xdp_xmit" " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" " err=%d", __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); #endif /* CONFIG_BPF_SYSCALL */ /* Expect users already include <net/xdp.h>, but not xdp_priv.h */ #include <net/xdp_priv.h> #define __MEM_TYPE_MAP(FN) \ FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); #define __MEM_TYPE_SYM_FN(x) \ { MEM_TYPE_##x, #x }, #define __MEM_TYPE_SYM_TAB \ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 } __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, TP_PROTO(const struct xdp_mem_allocator *xa), TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; ), TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator ) ); TRACE_EVENT(mem_connect, TP_PROTO(const struct xdp_mem_allocator *xa, const struct xdp_rxq_info *rxq), TP_ARGS(xa, rxq), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) __field(const struct xdp_rxq_info *, rxq) __field(int, ifindex) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; __entry->rxq = rxq; __entry->ifindex = rxq->dev->ifindex; ), TP_printk("mem_id=%d mem_type=%s allocator=%p" " ifindex=%d", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator, __entry->ifindex ) ); TRACE_EVENT(bpf_xdp_link_attach_failed, TP_PROTO(const char *msg), TP_ARGS(msg), TP_STRUCT__entry( __string(msg, msg) ), TP_fast_assign( __assign_str(msg); ), TP_printk("errmsg=%s", __get_str(msg)) ); #endif /* _TRACE_XDP_H */ #include <trace/define_trace.h> |
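/*
 * Usage sketch (illustrative only, not part of this header): how a network
 * driver typically fires the xdp_exception tracepoint defined above when an
 * XDP program returns an unexpected or aborted verdict. The surrounding
 * driver context (netdev, xdp_prog, xdp) is hypothetical.
 */
#if 0	/* example only, not compiled here */
static u32 example_run_xdp(struct net_device *netdev, struct bpf_prog *xdp_prog,
			   struct xdp_buff *xdp)
{
	u32 act = bpf_prog_run_xdp(xdp_prog, xdp);

	switch (act) {
	case XDP_PASS:
	case XDP_TX:
	case XDP_REDIRECT:
		break;
	case XDP_ABORTED:
	default:
		/* report the aborted/unknown verdict via the tracepoint above */
		trace_xdp_exception(netdev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		act = XDP_DROP;
		break;
	}

	return act;
}
#endif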
857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 | // SPDX-License-Identifier: GPL-2.0 #include <linux/anon_inodes.h> #include <linux/backing-dev.h> #include <linux/falloc.h> #include <linux/fs.h> #include <linux/kvm_host.h> #include <linux/mempolicy.h> #include <linux/pseudo_fs.h> #include <linux/pagemap.h> #include "kvm_mm.h" static struct vfsmount *kvm_gmem_mnt; /* * A guest_memfd instance can be associated multiple VMs, each with its own * "view" of the underlying physical memory. * * The gmem's inode is effectively the raw underlying physical storage, and is * used to track properties of the physical memory, while each gmem file is * effectively a single VM's view of that storage, and is used to track assets * specific to its associated VM, e.g. memslots=>gmem bindings. */ struct gmem_file { struct kvm *kvm; struct xarray bindings; struct list_head entry; }; struct gmem_inode { struct shared_policy policy; struct inode vfs_inode; u64 flags; }; static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) { return container_of(inode, struct gmem_inode, vfs_inode); } #define kvm_gmem_for_each_file(f, mapping) \ list_for_each_entry(f, &(mapping)->i_private_list, entry) /** * folio_file_pfn - like folio_file_page, but return a pfn. * @folio: The folio which contains this index. * @index: The index we want to look up. * * Return: The pfn for this index. */ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) { return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); } static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) { return gfn - slot->base_gfn + slot->gmem.pgoff; } static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, pgoff_t index, struct folio *folio) { #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE kvm_pfn_t pfn = folio_file_pfn(folio, index); gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff; int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); if (rc) { pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", index, gfn, pfn, rc); return rc; } #endif return 0; } static inline void kvm_gmem_mark_prepared(struct folio *folio) { folio_mark_uptodate(folio); } /* * Process @folio, which contains @gfn, so that the guest can use it. * The folio must be locked and the gfn must be contained in @slot. * On successful return the guest sees a zero page so as to avoid * leaking host data and the up-to-date flag is set. */ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, struct folio *folio) { unsigned long nr_pages, i; pgoff_t index; int r; nr_pages = folio_nr_pages(folio); for (i = 0; i < nr_pages; i++) clear_highpage(folio_page(folio, i)); /* * Preparing huge folios should always be safe, since it should * be possible to split them later if needed. 
* * Right now the folio order is always going to be zero, but the * code is ready for huge folios. The only assumption is that * the base pgoff of memslots is naturally aligned with the * requested page order, ensuring that huge folios can also use * huge page table entries for GPA->HPA mapping. * * The order will be passed when creating the guest_memfd, and * checked when creating memslots. */ WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio))); index = kvm_gmem_get_index(slot, gfn); index = ALIGN_DOWN(index, folio_nr_pages(folio)); r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); if (!r) kvm_gmem_mark_prepared(folio); return r; } /* * Returns a locked folio on success. The caller is responsible for * setting the up-to-date flag before the memory is mapped into the guest. * There is no backing storage for the memory, so the folio will remain * up-to-date until it's removed. * * Ignore accessed, referenced, and dirty flags. The memory is * unevictable and there is no storage to write back to. */ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) { /* TODO: Support huge pages. */ struct mempolicy *policy; struct folio *folio; /* * Fast-path: See if folio is already present in mapping to avoid * policy_lookup. */ folio = __filemap_get_folio(inode->i_mapping, index, FGP_LOCK | FGP_ACCESSED, 0); if (!IS_ERR(folio)) return folio; policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index); folio = __filemap_get_folio_mpol(inode->i_mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_mask(inode->i_mapping), policy); mpol_cond_put(policy); return folio; } static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) { if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED) return KVM_FILTER_SHARED; return KVM_FILTER_PRIVATE; } static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, pgoff_t end, enum kvm_gfn_range_filter attr_filter) { bool flush = false, found_memslot = false; struct kvm_memory_slot *slot; struct kvm *kvm = f->kvm; unsigned long index; xa_for_each_range(&f->bindings, index, slot, start, end - 1) { pgoff_t pgoff = slot->gmem.pgoff; struct kvm_gfn_range gfn_range = { .start = slot->base_gfn + max(pgoff, start) - pgoff, .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff, .slot = slot, .may_block = true, .attr_filter = attr_filter, }; if (!found_memslot) { found_memslot = true; KVM_MMU_LOCK(kvm); kvm_mmu_invalidate_begin(kvm); } flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range); } if (flush) kvm_flush_remote_tlbs(kvm); if (found_memslot) KVM_MMU_UNLOCK(kvm); } static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, pgoff_t end) { enum kvm_gfn_range_filter attr_filter; struct gmem_file *f; attr_filter = kvm_gmem_get_invalidate_filter(inode); kvm_gmem_for_each_file(f, inode->i_mapping) __kvm_gmem_invalidate_begin(f, start, end, attr_filter); } static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, pgoff_t end) { struct kvm *kvm = f->kvm; if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { KVM_MMU_LOCK(kvm); kvm_mmu_invalidate_end(kvm); KVM_MMU_UNLOCK(kvm); } } static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, pgoff_t end) { struct gmem_file *f; kvm_gmem_for_each_file(f, inode->i_mapping) __kvm_gmem_invalidate_end(f, start, end); } static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t start = offset >> PAGE_SHIFT; pgoff_t end = (offset + len) >> PAGE_SHIFT; /* * Bindings 
must be stable across invalidation to ensure the start+end * are balanced. */ filemap_invalidate_lock(inode->i_mapping); kvm_gmem_invalidate_begin(inode, start, end); truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); kvm_gmem_invalidate_end(inode, start, end); filemap_invalidate_unlock(inode->i_mapping); return 0; } static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) { struct address_space *mapping = inode->i_mapping; pgoff_t start, index, end; int r; /* Dedicated guest is immutable by default. */ if (offset + len > i_size_read(inode)) return -EINVAL; filemap_invalidate_lock_shared(mapping); start = offset >> PAGE_SHIFT; end = (offset + len) >> PAGE_SHIFT; r = 0; for (index = start; index < end; ) { struct folio *folio; if (signal_pending(current)) { r = -EINTR; break; } folio = kvm_gmem_get_folio(inode, index); if (IS_ERR(folio)) { r = PTR_ERR(folio); break; } index = folio_next_index(folio); folio_unlock(folio); folio_put(folio); /* 64-bit only, wrapping the index should be impossible. */ if (WARN_ON_ONCE(!index)) break; cond_resched(); } filemap_invalidate_unlock_shared(mapping); return r; } static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { int ret; if (!(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) return -EINVAL; if (mode & FALLOC_FL_PUNCH_HOLE) ret = kvm_gmem_punch_hole(file_inode(file), offset, len); else ret = kvm_gmem_allocate(file_inode(file), offset, len); if (!ret) file_modified(file); return ret; } static int kvm_gmem_release(struct inode *inode, struct file *file) { struct gmem_file *f = file->private_data; struct kvm_memory_slot *slot; struct kvm *kvm = f->kvm; unsigned long index; /* * Prevent concurrent attempts to *unbind* a memslot. This is the last * reference to the file and thus no new bindings can be created, but * dereferencing the slot for existing bindings needs to be protected * against memslot updates, specifically so that unbind doesn't race * and free the memslot (kvm_gmem_get_file() will return NULL). * * Since .release is called only when the reference count is zero, * after which file_ref_get() and get_file_active() fail, * kvm_gmem_get_pfn() cannot be using the file concurrently. * file_ref_put() provides a full barrier, and get_file_active() the * matching acquire barrier. */ mutex_lock(&kvm->slots_lock); filemap_invalidate_lock(inode->i_mapping); xa_for_each(&f->bindings, index, slot) WRITE_ONCE(slot->gmem.file, NULL); /* * All in-flight operations are gone and new bindings can be created. * Zap all SPTEs pointed at by this file. Do not free the backing * memory, as its lifetime is associated with the inode, not the file. */ __kvm_gmem_invalidate_begin(f, 0, -1ul, kvm_gmem_get_invalidate_filter(inode)); __kvm_gmem_invalidate_end(f, 0, -1ul); list_del(&f->entry); filemap_invalidate_unlock(inode->i_mapping); mutex_unlock(&kvm->slots_lock); xa_destroy(&f->bindings); kfree(f); kvm_put_kvm(kvm); return 0; } static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) { /* * Do not return slot->gmem.file if it has already been closed; * there might be some time between the last fput() and when * kvm_gmem_release() clears slot->gmem.file. 
*/ return get_file_active(&slot->gmem.file); } DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T), kvm_gmem_get_file(slot), struct kvm_memory_slot *slot); static bool kvm_gmem_supports_mmap(struct inode *inode) { return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); struct folio *folio; vm_fault_t ret = VM_FAULT_LOCKED; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)) return VM_FAULT_SIGBUS; folio = kvm_gmem_get_folio(inode, vmf->pgoff); if (IS_ERR(folio)) { if (PTR_ERR(folio) == -EAGAIN) return VM_FAULT_RETRY; return vmf_error(PTR_ERR(folio)); } if (WARN_ON_ONCE(folio_test_large(folio))) { ret = VM_FAULT_SIGBUS; goto out_folio; } if (!folio_test_uptodate(folio)) { clear_highpage(folio_page(folio, 0)); kvm_gmem_mark_prepared(folio); } vmf->page = folio_file_page(folio, vmf->pgoff); out_folio: if (ret != VM_FAULT_LOCKED) { folio_unlock(folio); folio_put(folio); } return ret; } #ifdef CONFIG_NUMA static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct inode *inode = file_inode(vma->vm_file); return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *pgoff) { struct inode *inode = file_inode(vma->vm_file); *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); /* * Return the memory policy for this index, or NULL if none is set. * * Returning NULL, e.g. instead of the current task's memory policy, is * important for the .get_policy kernel ABI: it indicates that no * explicit policy has been set via mbind() for this memory. The caller * can then replace NULL with the default memory policy instead of the * current task's memory policy. */ return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff); } #endif /* CONFIG_NUMA */ static const struct vm_operations_struct kvm_gmem_vm_ops = { .fault = kvm_gmem_fault_user_mapping, #ifdef CONFIG_NUMA .get_policy = kvm_gmem_get_policy, .set_policy = kvm_gmem_set_policy, #endif }; static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) { if (!kvm_gmem_supports_mmap(file_inode(file))) return -ENODEV; if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != (VM_SHARED | VM_MAYSHARE)) { return -EINVAL; } vma->vm_ops = &kvm_gmem_vm_ops; return 0; } static struct file_operations kvm_gmem_fops = { .mmap = kvm_gmem_mmap, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, }; static int kvm_gmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { WARN_ON_ONCE(1); return -EINVAL; } static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio) { pgoff_t start, end; filemap_invalidate_lock_shared(mapping); start = folio->index; end = start + folio_nr_pages(folio); kvm_gmem_invalidate_begin(mapping->host, start, end); /* * Do not truncate the range, what action is taken in response to the * error is userspace's decision (assuming the architecture supports * gracefully handling memory errors). If/when the guest attempts to * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON, * at which point KVM can either terminate the VM or propagate the * error to userspace. 
*/ kvm_gmem_invalidate_end(mapping->host, start, end); filemap_invalidate_unlock_shared(mapping); return MF_DELAYED; } #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE static void kvm_gmem_free_folio(struct folio *folio) { struct page *page = folio_page(folio, 0); kvm_pfn_t pfn = page_to_pfn(page); int order = folio_order(folio); kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); } #endif static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE .free_folio = kvm_gmem_free_folio, #endif }; static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EINVAL; } static const struct inode_operations kvm_gmem_iops = { .setattr = kvm_gmem_setattr, }; bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) { return true; } static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { static const char *name = "[kvm-gmem]"; struct gmem_file *f; struct inode *inode; struct file *file; int fd, err; fd = get_unused_fd_flags(0); if (fd < 0) return fd; f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) { err = -ENOMEM; goto err_fd; } /* __fput() will take care of fops_put(). */ if (!fops_get(&kvm_gmem_fops)) { err = -ENOENT; goto err_gmem; } inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto err_fops; } inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_inaccessible(inode->i_mapping); /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); GMEM_I(inode)->flags = flags; file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_inode; } file->f_flags |= O_LARGEFILE; file->private_data = f; kvm_get_kvm(kvm); f->kvm = kvm; xa_init(&f->bindings); list_add(&f->entry, &inode->i_mapping->i_private_list); fd_install(fd, file); return fd; err_inode: iput(inode); err_fops: fops_put(&kvm_gmem_fops); err_gmem: kfree(f); err_fd: put_unused_fd(fd); return err; } int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) { loff_t size = args->size; u64 flags = args->flags; if (flags & ~kvm_gmem_get_supported_flags(kvm)) return -EINVAL; if (size <= 0 || !PAGE_ALIGNED(size)) return -EINVAL; return __kvm_gmem_create(kvm, size, flags); } int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset) { loff_t size = slot->npages << PAGE_SHIFT; unsigned long start, end; struct gmem_file *f; struct inode *inode; struct file *file; int r = -EINVAL; BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff)); file = fget(fd); if (!file) return -EBADF; if (file->f_op != &kvm_gmem_fops) goto err; f = file->private_data; if (f->kvm != kvm) goto err; inode = file_inode(file); if (offset < 0 || !PAGE_ALIGNED(offset) || offset + size > i_size_read(inode)) goto err; filemap_invalidate_lock(inode->i_mapping); start = offset >> PAGE_SHIFT; end = start + slot->npages; if (!xa_empty(&f->bindings) && xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { filemap_invalidate_unlock(inode->i_mapping); goto err; } /* * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so * kvm_gmem_bind() must occur on a new memslot. 
Because the memslot * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file. */ WRITE_ONCE(slot->gmem.file, file); slot->gmem.pgoff = start; if (kvm_gmem_supports_mmap(inode)) slot->flags |= KVM_MEMSLOT_GMEM_ONLY; xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL); filemap_invalidate_unlock(inode->i_mapping); /* * Drop the reference to the file, even on success. The file pins KVM, * not the other way 'round. Active bindings are invalidated if the * file is closed before memslots are destroyed. */ r = 0; err: fput(file); return r; } static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f) { unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL); /* * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() * cannot see this memslot. */ WRITE_ONCE(slot->gmem.file, NULL); } void kvm_gmem_unbind(struct kvm_memory_slot *slot) { /* * Nothing to do if the underlying file was _already_ closed, as * kvm_gmem_release() invalidates and nullifies all bindings. */ if (!slot->gmem.file) return; CLASS(gmem_get_file, file)(slot); /* * However, if the file is _being_ closed, then the bindings need to be * removed as kvm_gmem_release() might not run until after the memslot * is freed. Note, modifying the bindings is safe even though the file * is dying as kvm_gmem_release() nullifies slot->gmem.file under * slots_lock, and only puts its reference to KVM after destroying all * bindings. I.e. reaching this point means kvm_gmem_release() hasn't * yet destroyed the bindings or freed the gmem_file, and can't do so * until the caller drops slots_lock. */ if (!file) { __kvm_gmem_unbind(slot, slot->gmem.file->private_data); return; } filemap_invalidate_lock(file->f_mapping); __kvm_gmem_unbind(slot, file->private_data); filemap_invalidate_unlock(file->f_mapping); } /* Returns a locked folio on success. 
*/ static struct folio *__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, pgoff_t index, kvm_pfn_t *pfn, bool *is_prepared, int *max_order) { struct file *slot_file = READ_ONCE(slot->gmem.file); struct gmem_file *f = file->private_data; struct folio *folio; if (file != slot_file) { WARN_ON_ONCE(slot_file); return ERR_PTR(-EFAULT); } if (xa_load(&f->bindings, index) != slot) { WARN_ON_ONCE(xa_load(&f->bindings, index)); return ERR_PTR(-EIO); } folio = kvm_gmem_get_folio(file_inode(file), index); if (IS_ERR(folio)) return folio; if (folio_test_hwpoison(folio)) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-EHWPOISON); } *pfn = folio_file_pfn(folio, index); if (max_order) *max_order = 0; *is_prepared = folio_test_uptodate(folio); return folio; } int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order) { pgoff_t index = kvm_gmem_get_index(slot, gfn); struct folio *folio; bool is_prepared = false; int r = 0; CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); if (IS_ERR(folio)) return PTR_ERR(folio); if (!is_prepared) r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); folio_unlock(folio); if (!r) *page = folio_file_page(folio, index); else folio_put(folio); return r; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { struct kvm_memory_slot *slot; void __user *p; int ret = 0, max_order; long i; lockdep_assert_held(&kvm->slots_lock); if (WARN_ON_ONCE(npages <= 0)) return -EINVAL; slot = gfn_to_memslot(kvm, start_gfn); if (!kvm_slot_has_gmem(slot)) return -EINVAL; CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; filemap_invalidate_lock(file->f_mapping); npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); for (i = 0; i < npages; i += (1 << max_order)) { struct folio *folio; gfn_t gfn = start_gfn + i; pgoff_t index = kvm_gmem_get_index(slot, gfn); bool is_prepared = false; kvm_pfn_t pfn; if (signal_pending(current)) { ret = -EINTR; break; } folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; } if (is_prepared) { folio_unlock(folio); folio_put(folio); ret = -EEXIST; break; } folio_unlock(folio); WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || (npages - i) < (1 << max_order)); ret = -EINVAL; while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_MEMORY_ATTRIBUTE_PRIVATE)) { if (!max_order) goto put_folio_and_exit; max_order--; } p = src ? src + i * PAGE_SIZE : NULL; ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); if (!ret) kvm_gmem_mark_prepared(folio); put_folio_and_exit: folio_put(folio); if (ret) break; } filemap_invalidate_unlock(file->f_mapping); return ret && !i ? ret : i; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif static struct kmem_cache *kvm_gmem_inode_cachep; static void kvm_gmem_init_inode_once(void *__gi) { struct gmem_inode *gi = __gi; /* * Note! Don't initialize the inode with anything specific to the * guest_memfd instance, or that might be specific to how the inode is * used (from the VFS-layer's perspective). This hook is called only * during the initial slab allocation, i.e. 
only fields/state that are * idempotent across _all_ use of the inode _object_ can be initialized * at this time! */ inode_init_once(&gi->vfs_inode); } static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) { struct gmem_inode *gi; gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL); if (!gi) return NULL; mpol_shared_policy_init(&gi->policy, NULL); gi->flags = 0; return &gi->vfs_inode; } static void kvm_gmem_destroy_inode(struct inode *inode) { mpol_free_shared_policy(&GMEM_I(inode)->policy); } static void kvm_gmem_free_inode(struct inode *inode) { kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode)); } static const struct super_operations kvm_gmem_super_operations = { .statfs = simple_statfs, .alloc_inode = kvm_gmem_alloc_inode, .destroy_inode = kvm_gmem_destroy_inode, .free_inode = kvm_gmem_free_inode, }; static int kvm_gmem_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; ctx = fc->fs_private; ctx->ops = &kvm_gmem_super_operations; return 0; } static struct file_system_type kvm_gmem_fs = { .name = "guest_memfd", .init_fs_context = kvm_gmem_init_fs_context, .kill_sb = kill_anon_super, }; static int kvm_gmem_init_mount(void) { kvm_gmem_mnt = kern_mount(&kvm_gmem_fs); if (IS_ERR(kvm_gmem_mnt)) return PTR_ERR(kvm_gmem_mnt); kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; return 0; } int kvm_gmem_init(struct module *module) { struct kmem_cache_args args = { .align = 0, .ctor = kvm_gmem_init_inode_once, }; int ret; kvm_gmem_fops.owner = module; kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache", sizeof(struct gmem_inode), &args, SLAB_ACCOUNT); if (!kvm_gmem_inode_cachep) return -ENOMEM; ret = kvm_gmem_init_mount(); if (ret) { kmem_cache_destroy(kvm_gmem_inode_cachep); return ret; } return 0; } void kvm_gmem_exit(void) { kern_unmount(kvm_gmem_mnt); kvm_gmem_mnt = NULL; rcu_barrier(); kmem_cache_destroy(kvm_gmem_inode_cachep); } |
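/*
 * Illustrative userspace sketch, not part of the file above: driving the
 * creation path implemented by kvm_gmem_create()/__kvm_gmem_create(). It
 * assumes the KVM_CREATE_GUEST_MEMFD ioctl and struct kvm_create_guest_memfd
 * from the KVM uAPI headers, and that the GUEST_MEMFD_FLAG_MMAP and
 * GUEST_MEMFD_FLAG_INIT_SHARED flags used above are accepted by
 * kvm_gmem_get_supported_flags() for this VM; treat the exact flag plumbing
 * as an assumption. Per kvm_gmem_create(), the size must be non-zero and
 * page-aligned; per kvm_gmem_mmap(), the mapping must be MAP_SHARED and is
 * rejected with -ENODEV unless the mmap flag was set at creation; and per
 * kvm_gmem_fault_user_mapping(), faulting the pages additionally requires
 * the init-shared flag.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static int create_mappable_guest_memfd(int vm_fd, uint64_t size, void **mem)
{
	struct kvm_create_guest_memfd args = {
		.size  = size,	/* must be non-zero and page-aligned */
		.flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED,
	};
	int gmem_fd;

	gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
	if (gmem_fd < 0)
		return -1;

	/* kvm_gmem_mmap() only accepts shared mappings. */
	*mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gmem_fd, 0);
	if (*mem == MAP_FAILED) {
		close(gmem_fd);
		return -1;
	}

	return gmem_fd;
}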
// SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/export.h> #include <linux/hex.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include "kstrtox.h" /** * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. */ int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parse(buf, UINT_MAX, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parse_user); /** * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * @buf: page aligned buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * * Output format is a comma-separated list of decimal numbers and * ranges if list is specified or hex digits grouped into comma-separated * sets of 8 digits/set. Returns the number of characters written to buf.
* * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned * area and that sufficient storage remains at @buf to accommodate the * bitmap_print_to_pagebuf() output. Returns the number of characters * actually printed to @buf, excluding terminating '\0'. */ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits) { ptrdiff_t len = PAGE_SIZE - offset_in_page(buf); return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); } EXPORT_SYMBOL(bitmap_print_to_pagebuf); /** * bitmap_print_to_buf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * true: print in decimal list format * false: print in hexadecimal bitmask format * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: offset in the formatted string from which we copy into @buf * @count: the maximum number of bytes to print */ static int bitmap_print_to_buf(bool list, char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { const char *fmt = list ? "%*pbl\n" : "%*pb\n"; ssize_t size; void *data; data = kasprintf(GFP_KERNEL, fmt, nmaskbits, maskp); if (!data) return -ENOMEM; size = memory_read_from_buffer(buf, count, &off, data, strlen(data) + 1); kfree(data); return size; } /** * bitmap_print_bitmask_to_buf - convert bitmap to hex bitmask format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: offset in the formatted string from which we copy into @buf * @count: the maximum number of bytes to print * * bitmap_print_to_pagebuf() is used indirectly via its cpumap wrapper * cpumap_print_to_pagebuf(), or directly by drivers, to export a hexadecimal * bitmask or decimal list to userspace through the sysfs ABI. * Drivers might use a normal attribute for this kind of ABI. A * normal attribute typically has a show entry as below:: * * static ssize_t example_attribute_show(struct device *dev, * struct device_attribute *attr, char *buf) * { * ... * return bitmap_print_to_pagebuf(true, buf, &mask, nr_trig_max); * } * * The show entry of a normal attribute has no offset and count parameters, * so the file is limited to a single page. * The bitmap_print_to_pagebuf() API works well for this kind of normal * attribute, which takes a buf parameter but no offset or count:: * * bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, * int nmaskbits) * { * } * * The problem is that once we have a large bitmap, the resulting bitmask * or list may exceed one page. Especially for a list, it could be as complex * as 0,3,5,7,9,..., and there is no simple way to know its exact size in * advance. It turns out bin_attribute is a way to break this limit. A * bin_attribute has a show entry as below:: * * static ssize_t * example_bin_attribute_show(struct file *filp, struct kobject *kobj, * struct bin_attribute *attr, char *buf, * loff_t offset, size_t count) * { * ... * } * * With the new offset and count parameters, the sysfs ABI can support files * larger than one page. For example, offset could be >= 4096. * bitmap_print_bitmask_to_buf() and bitmap_print_list_to_buf(), with their * cpumap wrappers cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), * let those drivers support a large bitmask or list once they move to * bin_attribute.
As a result, we have to pass the corresponding * parameters, such as off and count, from the bin_attribute show entry to this API. * * The role of cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf() * is similar to that of cpumap_print_to_pagebuf(); the difference is that * bitmap_print_to_pagebuf() mainly serves sysfs attributes under the assumption * that the destination buffer is exactly one page and never more. * cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), on the other * hand, mainly serve bin_attribute, which is not tied to exactly one page, * so they can break the size limit on the converted decimal list and hexadecimal * bitmask. * * WARNING! * * This function is not a replacement for sprintf() or bitmap_print_to_pagebuf(). * It is intended to work around the sysfs limitations discussed above and should be * used carefully in the general case for the following reasons: * * - Time complexity is O(nbits^2/count), compared to O(nbits) for snprintf(). * - Memory complexity is O(nbits), compared to O(1) for snprintf(). * - @off and @count are NOT an offset and a number of bits to print. * - If printing part of a bitmap as a list, the resulting string is not a correct * list representation of the bitmap. In particular, some bits within or outside * the related interval may be erroneously set or unset. The format of the string * may be broken, so a bitmap_parselist()-like parser may fail to parse it. * - If printing the whole bitmap as a list in parts, the user must order the * calls so that the offset is incremented linearly. * - If printing the whole bitmap as a list in parts, the user must keep the bitmap * unchanged between the very first and very last call. Otherwise the concatenated * result may be incorrect and its format broken. * * Returns the number of characters actually printed to @buf */ int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(false, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_bitmask_to_buf); /** * bitmap_print_list_to_buf - convert bitmap to decimal list format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: offset in the formatted string from which we copy into @buf * @count: the maximum number of bytes to print * * Everything is the same as in bitmap_print_bitmask_to_buf() above, except * for the print format. */ int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(true, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_list_to_buf); /* * Region 9-38:4/10 describes the following bitmap structure: * 0 9 12 18 38 N * .........****......****......****..................
* ^ ^ ^ ^ ^ * start off group_len end nbits */ struct region { unsigned int start; unsigned int off; unsigned int group_len; unsigned int end; unsigned int nbits; }; static void bitmap_set_region(const struct region *r, unsigned long *bitmap) { unsigned int start; for (start = r->start; start <= r->end; start += r->group_len) bitmap_set(bitmap, start, min(r->end - start + 1, r->off)); } static int bitmap_check_region(const struct region *r) { if (r->start > r->end || r->group_len == 0 || r->off > r->group_len) return -EINVAL; if (r->end >= r->nbits) return -ERANGE; return 0; } static const char *bitmap_getnum(const char *str, unsigned int *num, unsigned int lastbit) { unsigned long long n; unsigned int len; if (str[0] == 'N') { *num = lastbit; return str + 1; } len = _parse_integer(str, 10, &n); if (!len) return ERR_PTR(-EINVAL); if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n) return ERR_PTR(-EOVERFLOW); *num = n; return str + len; } static inline bool end_of_str(char c) { return c == '\0' || c == '\n'; } static inline bool __end_of_region(char c) { return isspace(c) || c == ','; } static inline bool end_of_region(char c) { return __end_of_region(c) || end_of_str(c); } /* * The format allows commas and whitespaces at the beginning * of the region. */ static const char *bitmap_find_region(const char *str) { while (__end_of_region(*str)) str++; return end_of_str(*str) ? NULL : str; } static const char *bitmap_find_region_reverse(const char *start, const char *end) { while (start <= end && __end_of_region(*end)) end--; return end; } static const char *bitmap_parse_region(const char *str, struct region *r) { unsigned int lastbit = r->nbits - 1; if (!strncasecmp(str, "all", 3)) { r->start = 0; r->end = lastbit; str += 3; goto check_pattern; } str = bitmap_getnum(str, &r->start, lastbit); if (IS_ERR(str)) return str; if (end_of_region(*str)) goto no_end; if (*str != '-') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->end, lastbit); if (IS_ERR(str)) return str; check_pattern: if (end_of_region(*str)) goto no_pattern; if (*str != ':') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->off, lastbit); if (IS_ERR(str)) return str; if (*str != '/') return ERR_PTR(-EINVAL); return bitmap_getnum(str + 1, &r->group_len, lastbit); no_end: r->end = r->start; no_pattern: r->off = r->end + 1; r->group_len = r->end + 1; return end_of_str(*str) ? NULL : str; } /** * bitmap_parselist - convert list format ASCII string to bitmap * @buf: read user string from this buffer; must be terminated * with a \0 or \n. * @maskp: write resulting mask here * @nmaskbits: number of bits in mask to be written * * Input format is a comma-separated list of decimal numbers and * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. * Optionally each range can be postfixed to denote that only parts of it * should be set. The range will divided to groups of specific size. * From each group will be used only defined amount of bits. * Syntax: range:used_size/group_size * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * The value 'N' can be used as a dynamically substituted token for the * maximum allowed value; i.e (nmaskbits - 1). Keep in mind that it is * dynamic, so if system changes cause the bitmap width to change, such * as more cores in a CPU list, then any ranges using N will also change. * * Returns: 0 on success, -errno on invalid input strings. 
Error values: * * - ``-EINVAL``: wrong region format * - ``-EINVAL``: invalid character in string * - ``-ERANGE``: bit number specified too large for mask * - ``-EOVERFLOW``: integer overflow in the input parameters */ int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) { struct region r; long ret; r.nbits = nmaskbits; bitmap_zero(maskp, r.nbits); while (buf) { buf = bitmap_find_region(buf); if (buf == NULL) return 0; buf = bitmap_parse_region(buf, &r); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_check_region(&r); if (ret) return ret; bitmap_set_region(&r, maskp); } return 0; } EXPORT_SYMBOL(bitmap_parselist); /** * bitmap_parselist_user() - convert user buffer's list format ASCII * string to bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Wrapper for bitmap_parselist(), providing it with user buffer. */ int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parselist(buf, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parselist_user); static const char *bitmap_get_x32_reverse(const char *start, const char *end, u32 *num) { u32 ret = 0; int c, i; for (i = 0; i < 32; i += 4) { c = hex_to_bin(*end--); if (c < 0) return ERR_PTR(-EINVAL); ret |= c << i; if (start > end || __end_of_region(*end)) goto out; } if (hex_to_bin(*end--) >= 0) return ERR_PTR(-EOVERFLOW); out: *num = ret; return end; } /** * bitmap_parse - convert an ASCII hex string into a bitmap. * @start: pointer to buffer containing string. * @buflen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0 or \n. In that case, * UINT_MAX may be provided instead of string length. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Commas group hex digits into chunks. Each chunk defines exactly 32 * bits of the resultant bitmask. No chunk may specify a value larger * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value * then leading 0-bits are prepended. %-EINVAL is returned for illegal * characters. Grouping such as "1,,5", ",44", "," or "" is allowed. * Leading, embedded and trailing whitespace accepted. */ int bitmap_parse(const char *start, unsigned int buflen, unsigned long *maskp, int nmaskbits) { const char *end = strnchrnul(start, buflen, '\n') - 1; int chunks = BITS_TO_U32(nmaskbits); u32 *bitmap = (u32 *)maskp; int unset_bit; int chunk; for (chunk = 0; ; chunk++) { end = bitmap_find_region_reverse(start, end); if (start > end) break; if (!chunks--) return -EOVERFLOW; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]); #else end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]); #endif if (IS_ERR(end)) return PTR_ERR(end); } unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32; if (unset_bit < nmaskbits) { bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit); return 0; } if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(bitmap_parse); |
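/*
 * Illustrative sketch (hypothetical caller, not part of the file above):
 * exercising bitmap_parselist() with the "range:used_size/group_size"
 * syntax documented there. Per the example in the kernel-doc,
 * "0-1023:2/256" uses two bits out of every 256-bit group within 0-1023,
 * i.e. bits 0,1,256,257,512,513,768,769.
 */
static int __maybe_unused bitmap_parselist_example(void)
{
	DECLARE_BITMAP(mask, 1024);
	int err;

	err = bitmap_parselist("0-1023:2/256", mask, 1024);
	if (err)
		return err;	/* -EINVAL, -ERANGE or -EOVERFLOW, see above */

	/* Eight bits are set: 0,1,256,257,512,513,768,769. */
	WARN_ON(bitmap_weight(mask, 1024) != 8);
	WARN_ON(!test_bit(256, mask) || test_bit(2, mask));

	return 0;
}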
// SPDX-License-Identifier: GPL-2.0 /* * Based on the fbdev code in drivers/video/fbdev/core/fb_cmdline: * * Copyright (C) 2014 Intel Corp * Copyright (C) 1994 Martin Schaller * * 2001 - Documented with DocBook * - Brad Douglas <brad@neruo.com> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. * * Authors: * Daniel Vetter <daniel.vetter@ffwll.ch> */ #include <linux/fb.h> /* for FB_MAX */ #include <linux/init.h> #include <video/cmdline.h> /* * FB_MAX is the maximum number of framebuffer devices and also * the maximum number of video= parameters. Although not directly * related to each other, it makes sense to keep it that way. */ static const char *video_options[FB_MAX] __read_mostly; static const char *video_option __read_mostly; static int video_of_only __read_mostly; static const char *__video_get_option_string(const char *name) { const char *options = NULL; size_t name_len = 0; if (name) name_len = strlen(name); if (name_len) { unsigned int i; const char *opt; for (i = 0; i < ARRAY_SIZE(video_options); ++i) { if (!video_options[i]) continue; if (video_options[i][0] == '\0') continue; opt = video_options[i]; if (!strncmp(opt, name, name_len) && opt[name_len] == ':') options = opt + name_len + 1; } } /* No match, return global options */ if (!options) options = video_option; return options; } /** * video_get_options - get kernel boot parameters * @name: name of the output as it would appear in the boot parameter * line (video=<name>:<options>) * * Looks up the video= options for the given name. Names are connector * names with DRM, or driver names with fbdev. If no video option for * the name has been specified, the function returns the global video= * setting. A @name of NULL always returns the global video setting. * * Returns: * The string of video options for the given name, or NULL if no video * option has been specified. */ const char *video_get_options(const char *name) { return __video_get_option_string(name); } EXPORT_SYMBOL(video_get_options); #if IS_ENABLED(CONFIG_FB_CORE) bool __video_get_options(const char *name, const char **options, bool is_of) { bool enabled = true; const char *opt = NULL; if (video_of_only && !is_of) enabled = false; opt = __video_get_option_string(name); if (options) *options = opt; return enabled; } EXPORT_SYMBOL(__video_get_options); #endif /* * Process command line options for video adapters. This function is * a __setup and __init function. It only stores the options. Drivers * have to call video_get_options() as necessary. */ static int __init video_setup(char *options) { if (!options || !*options) goto out; if (!strncmp(options, "ofonly", 6)) { video_of_only = true; goto out; } if (strchr(options, ':')) { /* named */ size_t i; for (i = 0; i < ARRAY_SIZE(video_options); i++) { if (!video_options[i]) { video_options[i] = options; break; } } } else { /* global */ video_option = options; } out: return 1; } __setup("video=", video_setup);
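/*
 * Illustrative sketch (hypothetical driver code, not part of the file
 * above): consuming the options stored by video_setup(). With a command
 * line such as "video=HDMI-A-1:1920x1080@60", a DRM driver would look up
 * the options by connector name; if no named entry matches,
 * video_get_options() falls back to the global video= value and returns
 * NULL only when nothing was specified at all.
 */
static void __maybe_unused example_video_option_lookup(void)
{
	const char *opt = video_get_options("HDMI-A-1");

	if (opt)
		pr_info("video= options for HDMI-A-1: %s\n", opt);
}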
/* Connection tracking via netlink socket. Allows for user space * protocol helpers and general trouble making from userspace. * * (C) 2001 by Jay Schulist <jschlst@samba.org> * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org> * (C) 2003 by Patrick Mchardy <kaber@trash.net> * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial connection tracking via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) * * Further development of this code funded by Astaro AG (http://www.astaro.com) * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference.
*/ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/rculist.h> #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/siphash.h> #include <linux/netfilter.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #endif #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include "nf_internals.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("List and change connection tracking table"); struct ctnetlink_list_dump_ctx { unsigned long last_id; unsigned int cpu; bool done; }; static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum)) goto nla_put_failure; if (likely(l4proto->tuple_to_nlattr)) ret = l4proto->tuple_to_nlattr(skb, tuple); nla_nest_end(skb, nest_parms); return ret; nla_put_failure: return -1; } static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) return -EMSGSIZE; return 0; } static int ipv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) return -EMSGSIZE; return 0; } static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { int ret = 0; struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, CTA_TUPLE_IP); if (!nest_parms) goto nla_put_failure; switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_tuple_to_nlattr(skb, tuple); break; case NFPROTO_IPV6: ret = ipv6_tuple_to_nlattr(skb, tuple); break; } nla_nest_end(skb, nest_parms); return ret; nla_put_failure: return -1; } static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_l4proto *l4proto; int ret; rcu_read_lock(); ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { l4proto = nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); } rcu_read_unlock(); return ret; } static int ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, const struct nf_conntrack_zone *zone, int dir) { if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) return 0; if (nla_put_be16(skb, attrtype, htons(zone->id))) goto nla_put_failure; return 0; nla_put_failure: 
return -1; } static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, bool skip_zero) { long timeout; if (nf_ct_is_confirmed(ct)) timeout = nf_ct_expires(ct) / HZ; else timeout = ct->timeout / HZ; if (skip_zero && timeout == 0) return 0; if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct, bool destroy) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; int ret; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->to_nlattr) return 0; nest_proto = nla_nest_start(skb, CTA_PROTOINFO); if (!nest_proto) goto nla_put_failure; ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy); nla_nest_end(skb, nest_proto); return ret; nla_put_failure: return -1; } static int ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_helper; const struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_helper *helper; if (!help) return 0; rcu_read_lock(); helper = rcu_dereference(help->helper); if (!helper) goto out; nest_helper = nla_nest_start(skb, CTA_HELP); if (!nest_helper) goto nla_put_failure; if (nla_put_string(skb, CTA_HELP_NAME, helper->name)) goto nla_put_failure; if (helper->to_nlattr) helper->to_nlattr(skb, ct); nla_nest_end(skb, nest_helper); out: rcu_read_unlock(); return 0; nla_put_failure: rcu_read_unlock(); return -1; } static int dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, enum ip_conntrack_dir dir, int type) { enum ctattr_type attr = dir ? 
CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nf_conn_counter *counter = acct->counter; struct nlattr *nest_count; u64 pkts, bytes; if (type == IPCTNL_MSG_CT_GET_CTRZERO) { pkts = atomic64_xchg(&counter[dir].packets, 0); bytes = atomic64_xchg(&counter[dir].bytes, 0); } else { pkts = atomic64_read(&counter[dir].packets); bytes = atomic64_read(&counter[dir].bytes); } nest_count = nla_nest_start(skb, attr); if (!nest_count) goto nla_put_failure; if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts), CTA_COUNTERS_PAD) || nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes), CTA_COUNTERS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest_count); return 0; nla_put_failure: return -1; } static int ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type) { struct nf_conn_acct *acct = nf_conn_acct_find(ct); if (!acct) return 0; if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0) return -1; if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0) return -1; return 0; } static int ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_count; const struct nf_conn_tstamp *tstamp; tstamp = nf_conn_tstamp_find(ct); if (!tstamp) return 0; nest_count = nla_nest_start(skb, CTA_TIMESTAMP); if (!nest_count) goto nla_put_failure; if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start), CTA_TIMESTAMP_PAD) || (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP, cpu_to_be64(tstamp->stop), CTA_TIMESTAMP_PAD))) goto nla_put_failure; nla_nest_end(skb, nest_count); return 0; nla_put_failure: return -1; } #ifdef CONFIG_NF_CONNTRACK_MARK static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct, bool dump) { u32 mark = READ_ONCE(ct->mark); if (!mark && !dump) return 0; if (nla_put_be32(skb, CTA_MARK, htonl(mark))) goto nla_put_failure; return 0; nla_put_failure: return -1; } #else #define ctnetlink_dump_mark(a, b, c) (0) #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; struct lsm_context ctx; int ret; ret = security_secid_to_secctx(ct->secmark, &ctx); if (ret < 0) return 0; ret = -1; nest_secctx = nla_nest_start(skb, CTA_SECCTX); if (!nest_secctx) goto nla_put_failure; if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context)) goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; nla_put_failure: security_release_secctx(&ctx); return ret; } #else #define ctnetlink_dump_secctx(a, b) (0) #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); if (e) { u64 ts = local64_read(&e->timestamp); if (ts) return nla_put_be64(skb, CTA_TIMESTAMP_EVENT, cpu_to_be64(ts), CTA_TIMESTAMP_PAD); } #endif return 0; } static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); if (!labels) return 0; return nla_total_size(sizeof(labels->bits)); } #endif static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); unsigned int i; if (!labels) return 0; i = 0; do { if (labels->bits[i] != 0) return nla_put(skb, CTA_LABELS, sizeof(labels->bits), labels->bits); i++; } while (i < ARRAY_SIZE(labels->bits)); return 0; } #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) static int 
ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_parms; if (!(ct->status & IPS_EXPECTED)) return 0; nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type) { struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, type); if (!nest_parms) goto nla_put_failure; if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS, htonl(seq->correction_pos)) || nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE, htonl(seq->offset_before)) || nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER, htonl(seq->offset_after))) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; spin_lock_bh(&ct->lock); seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) goto err; seq = &seqadj->seq[IP_CT_DIR_REPLY]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) goto err; spin_unlock_bh(&ct->lock); return 0; err: spin_unlock_bh(&ct->lock); return -1; } static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); struct nlattr *nest_parms; if (!synproxy) return 0; nest_parms = nla_nest_start(skb, CTA_SYNPROXY); if (!nest_parms) goto nla_put_failure; if (nla_put_be32(skb, CTA_SYNPROXY_ISN, htonl(synproxy->isn)) || nla_put_be32(skb, CTA_SYNPROXY_ITS, htonl(synproxy->its)) || nla_put_be32(skb, CTA_SYNPROXY_TSOFF, htonl(synproxy->tsoff))) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { __be32 id = (__force __be32)nf_ct_get_id(ct); if (nla_put_be32(skb, CTA_ID, id)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use)))) goto nla_put_failure; return 0; nla_put_failure: return -1; } /* all these functions access ct->ext. Caller must either hold a reference * on ct or prevent its deletion by holding either the bucket spinlock or * pcpu dying list lock. 
*/ static int ctnetlink_dump_extinfo(struct sk_buff *skb, struct nf_conn *ct, u32 type) { if (ctnetlink_dump_acct(skb, ct, type) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_labels(skb, ct) < 0 || ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || ctnetlink_dump_ct_synproxy(skb, ct) < 0) return -1; return 0; } static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if (ctnetlink_dump_status(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct, true) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0) return -1; if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && (ctnetlink_dump_timeout(skb, ct, false) < 0 || ctnetlink_dump_protoinfo(skb, ct, false) < 0)) return -1; return 0; } static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nf_conn *ct, bool extinfo, unsigned int flags) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nlattr *nest_parms; unsigned int event; if (portid) flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct), NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_ORIG) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_REPL) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_info(skb, ct) < 0) goto nla_put_failure; if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = { [CTA_IP_V4_SRC] = { .type = NLA_U32 }, [CTA_IP_V4_DST] = { .type = NLA_U32 }, [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 }, [CTA_IP_V6_DST] = { .len = sizeof(__be32) * 4 }, }; #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; size_t len, len4 = 0; len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); len += l4proto->nlattr_size; if (l4proto->nlattr_tuple_size) { len4 = l4proto->nlattr_tuple_size(); len4 *= 3u; /* ORIG, REPLY, MASTER */ } return len + len4; } static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) return 0; return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ ; } static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK int ret; ret = security_secid_to_secctx(ct->secmark, NULL); 
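/* Size estimation only: security_secid_to_secctx() is called with a NULL context buffer so it just reports the length of the secctx string; on a negative return no room is reserved for CTA_SECCTX/CTA_SECCTX_NAME. */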
if (ret < 0) return 0; return nla_total_size(0) /* CTA_SECCTX */ + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */ #else return 0; #endif } static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) return 0; return nla_total_size(0) + 2 * nla_total_size_64bit(sizeof(uint64_t)); #else return 0; #endif } #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) { return NLMSG_ALIGN(sizeof(struct nfgenmsg)) + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + ctnetlink_acct_size(ct) + ctnetlink_timestamp_size(ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + nla_total_size(0) /* CTA_PROTOINFO */ + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) #if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ #endif #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */ #endif ; } static int ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) { const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; struct nlattr *nest_parms; struct nf_conn *ct = item->ct; struct sk_buff *skb; unsigned int type; unsigned int flags = 0, group; int err; if (events & (1 << IPCT_DESTROY)) { type = IPCTNL_MSG_CT_DELETE; group = NFNLGRP_CONNTRACK_DESTROY; } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) { type = IPCTNL_MSG_CT_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; group = NFNLGRP_CONNTRACK_NEW; } else if (events) { type = IPCTNL_MSG_CT_NEW; group = NFNLGRP_CONNTRACK_UPDATE; } else return 0; net = nf_ct_net(ct); if (!item->report && !nfnetlink_has_listeners(net, group)) return 0; skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC); if (skb == NULL) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type); nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct), NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_ORIG) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_REPL) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0) goto nla_put_failure; if (events & (1 << 
IPCT_DESTROY)) { if (ctnetlink_dump_timeout(skb, ct, true) < 0) goto nla_put_failure; if (ctnetlink_dump_acct(skb, ct, type) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_protoinfo(skb, ct, true) < 0) goto nla_put_failure; } else { if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; if (events & (1 << IPCT_PROTOINFO) && ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) && ctnetlink_dump_helpinfo(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_SECMARK if ((events & (1 << IPCT_SECMARK) || ct->secmark) && ctnetlink_dump_secctx(skb, ct) < 0) goto nla_put_failure; #endif if (events & (1 << IPCT_LABEL) && ctnetlink_dump_labels(skb, ct) < 0) goto nla_put_failure; if (events & (1 << IPCT_RELATED) && ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; if (events & (1 << IPCT_SEQADJ) && ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; if (events & (1 << IPCT_SYNPROXY) && ctnetlink_dump_ct_synproxy(skb, ct) < 0) goto nla_put_failure; } #ifdef CONFIG_NF_CONNTRACK_MARK if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK))) goto nla_put_failure; #endif if (ctnetlink_dump_event_timestamp(skb, ct)) goto nla_put_failure; nlmsg_end(skb, nlh); err = nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); if (err == -ENOBUFS || err == -EAGAIN) return -ENOBUFS; return 0; nla_put_failure: nlmsg_cancel(skb, nlh); nlmsg_failure: kfree_skb(skb); errout: if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0) return -ENOBUFS; return 0; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ static int ctnetlink_done(struct netlink_callback *cb) { kfree(cb->data); return 0; } struct ctnetlink_filter_u32 { u32 val; u32 mask; }; struct ctnetlink_filter { u8 family; bool zone_filter; u_int32_t orig_flags; u_int32_t reply_flags; struct nf_conntrack_tuple orig; struct nf_conntrack_tuple reply; struct nf_conntrack_zone zone; struct ctnetlink_filter_u32 mark; struct ctnetlink_filter_u32 status; }; static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 }, [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 }, }; static int ctnetlink_parse_filter(const struct nlattr *attr, struct ctnetlink_filter *filter) { struct nlattr *tb[CTA_FILTER_MAX + 1]; int ret = 0; ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy, NULL); if (ret) return ret; if (tb[CTA_FILTER_ORIG_FLAGS]) { filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]); if (filter->orig_flags & ~CTA_FILTER_F_ALL) return -EOPNOTSUPP; } if (tb[CTA_FILTER_REPLY_FLAGS]) { filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]); if (filter->reply_flags & ~CTA_FILTER_F_ALL) return -EOPNOTSUPP; } return 0; } static int ctnetlink_parse_zone(const struct nlattr *attr, struct nf_conntrack_zone *zone); static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, u32 type, u_int8_t l3num, struct nf_conntrack_zone *zone, u_int32_t flags); static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark, const struct nlattr * const cda[]) { #ifdef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK]) { mark->val = ntohl(nla_get_be32(cda[CTA_MARK])); if (cda[CTA_MARK_MASK]) mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); else mark->mask = 0xffffffff; } else if (cda[CTA_MARK_MASK]) { return -EINVAL; } #endif return 0; } static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status, const 
struct nlattr * const cda[]) { if (cda[CTA_STATUS]) { status->val = ntohl(nla_get_be32(cda[CTA_STATUS])); if (cda[CTA_STATUS_MASK]) status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK])); else status->mask = status->val; /* status->val == 0? always true, else always false. */ if (status->mask == 0) return -EINVAL; } else if (cda[CTA_STATUS_MASK]) { return -EINVAL; } /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */ BUILD_BUG_ON(__IPS_MAX_BIT >= 32); return 0; } static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { struct ctnetlink_filter *filter; int err; #ifndef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) return ERR_PTR(-EOPNOTSUPP); #endif filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) return ERR_PTR(-ENOMEM); filter->family = family; err = ctnetlink_filter_parse_mark(&filter->mark, cda); if (err) goto err_filter; err = ctnetlink_filter_parse_status(&filter->status, cda); if (err) goto err_filter; if (cda[CTA_ZONE]) { err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); if (err < 0) goto err_filter; filter->zone_filter = true; } if (!cda[CTA_FILTER]) return filter; err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); if (err < 0) goto err_filter; if (filter->orig_flags) { if (!cda[CTA_TUPLE_ORIG]) { err = -EINVAL; goto err_filter; } err = ctnetlink_parse_tuple_filter(cda, &filter->orig, CTA_TUPLE_ORIG, filter->family, &filter->zone, filter->orig_flags); if (err < 0) goto err_filter; } if (filter->reply_flags) { if (!cda[CTA_TUPLE_REPLY]) { err = -EINVAL; goto err_filter; } err = ctnetlink_parse_tuple_filter(cda, &filter->reply, CTA_TUPLE_REPLY, filter->family, &filter->zone, filter->reply_flags); if (err < 0) goto err_filter; } return filter; err_filter: kfree(filter); return ERR_PTR(err); } static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) { return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE]; } static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; struct ctnetlink_filter *filter = NULL; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 family = nfmsg->nfgen_family; if (ctnetlink_needs_filter(family, cda)) { filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); } cb->data = filter; return 0; } static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple, struct nf_conntrack_tuple *ct_tuple, u_int32_t flags, int family) { switch (family) { case NFPROTO_IPV4: if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && filter_tuple->src.u3.ip != ct_tuple->src.u3.ip) return 0; if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip) return 0; break; case NFPROTO_IPV6: if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && !ipv6_addr_cmp(&filter_tuple->src.u3.in6, &ct_tuple->src.u3.in6)) return 0; if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && !ipv6_addr_cmp(&filter_tuple->dst.u3.in6, &ct_tuple->dst.u3.in6)) return 0; break; } if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) && filter_tuple->dst.protonum != ct_tuple->dst.protonum) return 0; switch (ct_tuple->dst.protonum) { case IPPROTO_TCP: case IPPROTO_UDP: if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) && filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) && filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port) return 0; break; case IPPROTO_ICMP: if ((flags & 
CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) && filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) && filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) && filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) return 0; break; case IPPROTO_ICMPV6: if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) && filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) && filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) && filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) return 0; break; } return 1; } static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; struct nf_conntrack_tuple *tuple; u32 status; if (filter == NULL) goto out; /* Match entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then match everything. */ if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; if (filter->zone_filter && !nf_ct_zone_equal_any(ct, &filter->zone)) goto ignore_entry; if (filter->orig_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, filter->orig_flags, filter->family)) goto ignore_entry; } if (filter->reply_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY); if (!ctnetlink_filter_match_tuple(&filter->reply, tuple, filter->reply_flags, filter->family)) goto ignore_entry; } #ifdef CONFIG_NF_CONNTRACK_MARK if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif status = (u32)READ_ONCE(ct->status); if ((status & filter->status.mask) != filter->status.val) goto ignore_entry; out: return 1; ignore_entry: return 0; } static unsigned long ctnetlink_get_id(const struct nf_conn *ct) { unsigned long id = nf_ct_get_id(ct); return id ? id : 1; } static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int flags = cb->data ? 
NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); unsigned long last_id = cb->args[1]; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *nf_ct_evict[8]; struct nf_conn *ct; int res, i; spinlock_t *lockp; i = 0; local_bh_disable(); for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: while (i) { i--; if (nf_ct_should_gc(nf_ct_evict[i])) nf_ct_kill(nf_ct_evict[i]); nf_ct_put(nf_ct_evict[i]); } lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); if (cb->args[0] >= nf_conntrack_htable_size) { spin_unlock(lockp); goto out; } hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(ct)) { /* need to defer nf_ct_kill() until lock is released */ if (i < ARRAY_SIZE(nf_ct_evict) && refcount_inc_not_zero(&ct->ct_general.use)) nf_ct_evict[i++] = ct; continue; } if (!net_eq(net, nf_ct_net(ct))) continue; if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; if (cb->args[1]) { if (ctnetlink_get_id(ct) != last_id) continue; cb->args[1] = 0; } if (!ctnetlink_filter_match(ct, cb->data)) continue; res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, true, flags); if (res < 0) { cb->args[1] = ctnetlink_get_id(ct); spin_unlock(lockp); goto out; } } spin_unlock(lockp); if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: local_bh_enable(); if (last_id) { /* nf ct hash resize happened, now clear the leftover. */ if (cb->args[1] == last_id) cb->args[1] = 0; } while (i) { i--; if (nf_ct_should_gc(nf_ct_evict[i])) nf_ct_kill(nf_ct_evict[i]); nf_ct_put(nf_ct_evict[i]); } return skb->len; } static int ipv4_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { if (!tb[CTA_IP_V4_SRC]) return -EINVAL; t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); } if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { if (!tb[CTA_IP_V4_DST]) return -EINVAL; t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); } return 0; } static int ipv6_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { if (!tb[CTA_IP_V6_SRC]) return -EINVAL; t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); } if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { if (!tb[CTA_IP_V6_DST]) return -EINVAL; t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); } return 0; } static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple, u_int32_t flags) { struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, cta_ip_nla_policy, NULL); if (ret < 0) return ret; switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_nlattr_to_tuple(tb, tuple, flags); break; case NFPROTO_IPV6: ret = ipv6_nlattr_to_tuple(tb, tuple, flags); break; } return ret; } static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_NUM] = { .type = NLA_U8 }, }; static int ctnetlink_parse_tuple_proto(struct nlattr *attr, struct nf_conntrack_tuple *tuple, u_int32_t flags) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; int ret = 0; ret = nla_parse_nested_deprecated(tb, CTA_PROTO_MAX, attr, proto_nla_policy, NULL); if (ret < 0) return ret; if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM))) return 0; if (!tb[CTA_PROTO_NUM]) return -EINVAL; tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); 
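/* The l4proto lookup and its ->nlattr_to_tuple() callback run under rcu_read_lock(); the per-protocol nla_policy is validated first so port/ICMP attributes are checked before being copied into the tuple. */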
l4proto = nf_ct_l4proto_find(tuple->dst.protonum); if (likely(l4proto->nlattr_to_tuple)) { ret = nla_validate_nested_deprecated(attr, CTA_PROTO_MAX, l4proto->nla_policy, NULL); if (ret == 0) ret = l4proto->nlattr_to_tuple(tb, tuple, flags); } rcu_read_unlock(); return ret; } static int ctnetlink_parse_zone(const struct nlattr *attr, struct nf_conntrack_zone *zone) { nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID, NF_CT_DEFAULT_ZONE_DIR, 0); #ifdef CONFIG_NF_CONNTRACK_ZONES if (attr) zone->id = ntohs(nla_get_be16(attr)); #else if (attr) return -EOPNOTSUPP; #endif return 0; } static int ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type, struct nf_conntrack_zone *zone) { int ret; if (zone->id != NF_CT_DEFAULT_ZONE_ID) return -EINVAL; ret = ctnetlink_parse_zone(attr, zone); if (ret < 0) return ret; if (type == CTA_TUPLE_REPLY) zone->dir = NF_CT_ZONE_DIR_REPL; else zone->dir = NF_CT_ZONE_DIR_ORIG; return 0; } static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_IP] = { .type = NLA_NESTED }, [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; #define CTA_FILTER_F_ALL_CTA_PROTO \ (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \ CTA_FILTER_F_CTA_PROTO_DST_PORT | \ CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \ CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \ CTA_FILTER_F_CTA_PROTO_ICMP_ID | \ CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \ CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \ CTA_FILTER_F_CTA_PROTO_ICMPV6_ID) static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, u32 type, u_int8_t l3num, struct nf_conntrack_zone *zone, u_int32_t flags) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; memset(tuple, 0, sizeof(*tuple)); err = nla_parse_nested_deprecated(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy, NULL); if (err < 0) return err; if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6) return -EOPNOTSUPP; tuple->src.l3num = l3num; if (flags & CTA_FILTER_FLAG(CTA_IP_DST) || flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { if (!tb[CTA_TUPLE_IP]) return -EINVAL; err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags); if (err < 0) return err; } if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) { if (!tb[CTA_TUPLE_PROTO]) return -EINVAL; err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags); if (err < 0) return err; } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) { /* Can't manage proto flags without a protonum */ return -EINVAL; } if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) { if (!zone) return -EINVAL; err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE], type, zone); if (err < 0) return err; } /* orig and expect tuples get DIR_ORIGINAL */ if (type == CTA_TUPLE_REPLY) tuple->dst.dir = IP_CT_DIR_REPLY; else tuple->dst.dir = IP_CT_DIR_ORIGINAL; return 0; } static int ctnetlink_parse_tuple(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, u32 type, u_int8_t l3num, struct nf_conntrack_zone *zone) { return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone, CTA_FILTER_FLAG(ALL)); } static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, }; static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, struct nlattr **helpinfo) { int err; struct nlattr *tb[CTA_HELP_MAX+1]; err = nla_parse_nested_deprecated(tb, CTA_HELP_MAX, attr, help_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_HELP_NAME]) return -EINVAL; *helper_name = 
nla_data(tb[CTA_HELP_NAME]); if (tb[CTA_HELP_INFO]) *helpinfo = tb[CTA_HELP_INFO]; return 0; } static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, [CTA_STATUS] = { .type = NLA_U32 }, [CTA_PROTOINFO] = { .type = NLA_NESTED }, [CTA_HELP] = { .type = NLA_NESTED }, [CTA_NAT_SRC] = { .type = NLA_NESTED }, [CTA_TIMEOUT] = { .type = NLA_U32 }, [CTA_MARK] = { .type = NLA_U32 }, [CTA_ID] = { .type = NLA_U32 }, [CTA_NAT_DST] = { .type = NLA_NESTED }, [CTA_TUPLE_MASTER] = { .type = NLA_NESTED }, [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED }, [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED }, [CTA_ZONE] = { .type = NLA_U16 }, [CTA_MARK_MASK] = { .type = NLA_U32 }, [CTA_LABELS] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, [CTA_STATUS_MASK] = { .type = NLA_U32 }, [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) { return ctnetlink_filter_match(ct, data); } static int ctnetlink_flush_conntrack(struct net *net, const struct nlattr * const cda[], u32 portid, int report, u8 family) { struct ctnetlink_filter *filter = NULL; struct nf_ct_iter_data iter = { .net = net, .portid = portid, .report = report, }; if (ctnetlink_needs_filter(family, cda)) { filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); iter.data = filter; } nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter); kfree(filter); return 0; } static int ctnetlink_del_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u8 family = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); if (err < 0) return err; if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, family, &zone); else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, family, &zone); else { u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? 
family : AF_UNSPEC; return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, nlmsg_report(info->nlh), u3); } if (err < 0) return err; h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); if (cda[CTA_ID]) { __be32 id = nla_get_be32(cda[CTA_ID]); if (id != (__force __be32)nf_ct_get_id(ct)) { nf_ct_put(ct); return -ENOENT; } } nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_put(ct); return 0; } static int ctnetlink_get_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; struct sk_buff *skb2; struct nf_conn *ct; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = ctnetlink_start, .dump = ctnetlink_dump_table, .done = ctnetlink_done, .data = (void *)cda, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); if (err < 0) return err; if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3, &zone); else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3, &zone); else return -EINVAL; if (err < 0) return err; h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) { nf_ct_put(ct); return -ENOMEM; } err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); if (err <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_dump_one_entry(struct sk_buff *skb, struct netlink_callback *cb, struct nf_conn *ct, bool dying) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 l3proto = nfmsg->nfgen_family; int res; if (l3proto && nf_ct_l3num(ct) != l3proto) return 0; if (ctx->last_id) { if (ctnetlink_get_id(ct) != ctx->last_id) return 0; ctx->last_id = 0; } /* We can't dump extension info for the unconfirmed * list because unconfirmed conntracks can have * ct->ext reallocated (and thus freed). * * In the dying list case ct->ext can't be free'd * until after we drop pcpu->lock. 
*/ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying, 0); if (res < 0) ctx->last_id = ctnetlink_get_id(ct); return res; } #endif static int ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) { return 0; } static int ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; #ifdef CONFIG_NF_CONNTRACK_EVENTS const struct net *net = sock_net(skb->sk); struct nf_conntrack_net_ecache *ecache_net; unsigned long last_id = ctx->last_id; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; #endif if (ctx->done) return 0; ctx->last_id = 0; #ifdef CONFIG_NF_CONNTRACK_EVENTS ecache_net = nf_conn_pernet_ecache(net); spin_lock_bh(&ecache_net->dying_lock); hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) { struct nf_conn *ct; int res; ct = nf_ct_tuplehash_to_ctrack(h); if (last_id && last_id != ctnetlink_get_id(ct)) continue; res = ctnetlink_dump_one_entry(skb, cb, ct, true); if (res < 0) { spin_unlock_bh(&ecache_net->dying_lock); return skb->len; } last_id = 0; } spin_unlock_bh(&ecache_net->dying_lock); #endif ctx->done = true; return skb->len; } static int ctnetlink_get_ct_dying(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; } static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_unconfirmed, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; } #if IS_ENABLED(CONFIG_NF_NAT) static int ctnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) __must_hold(RCU) { const struct nf_nat_hook *nat_hook; int err; nat_hook = rcu_dereference(nf_nat_hook); if (!nat_hook) { #ifdef CONFIG_MODULES rcu_read_unlock(); nfnl_unlock(NFNL_SUBSYS_CTNETLINK); if (request_module("nf-nat") < 0) { nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); return -EOPNOTSUPP; } nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook) return -EAGAIN; #endif return -EOPNOTSUPP; } err = nat_hook->parse_nat_setup(ct, manip, attr); if (err == -EAGAIN) { #ifdef CONFIG_MODULES rcu_read_unlock(); nfnl_unlock(NFNL_SUBSYS_CTNETLINK); if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) { nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); return -EOPNOTSUPP; } nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); #else err = -EOPNOTSUPP; #endif } return err; } #endif static int ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS]))); } static int ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) { #if IS_ENABLED(CONFIG_NF_NAT) int ret; if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) return 0; ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, cda[CTA_NAT_DST]); if (ret < 0) return ret; return ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, cda[CTA_NAT_SRC]); #else if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) return 0; return -EOPNOTSUPP; #endif } static int ctnetlink_change_helper(struct nf_conn *ct, const struct 
nlattr * const cda[]) { struct nf_conntrack_helper *helper; struct nf_conn_help *help = nfct_help(ct); char *helpname = NULL; struct nlattr *helpinfo = NULL; int err; err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); if (err < 0) return err; /* don't change helper of sibling connections */ if (ct->master) { /* If we try to change the helper to the same thing twice, * treat the second attempt as a no-op instead of returning * an error. */ err = -EBUSY; if (help) { rcu_read_lock(); helper = rcu_dereference(help->helper); if (helper && !strcmp(helper->name, helpname)) err = 0; rcu_read_unlock(); } return err; } if (!strcmp(helpname, "")) { if (help && help->helper) { /* we had a helper before ... */ nf_ct_remove_expectations(ct); RCU_INIT_POINTER(help->helper, NULL); } return 0; } rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); return -EOPNOTSUPP; } if (help) { if (rcu_access_pointer(help->helper) == helper) { /* update private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); err = 0; } else err = -EBUSY; } else { /* we cannot set a helper for an existing conntrack */ err = -EOPNOTSUPP; } rcu_read_unlock(); return err; } static int ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) { return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ); } #if defined(CONFIG_NF_CONNTRACK_MARK) static void ctnetlink_change_mark(struct nf_conn *ct, const struct nlattr * const cda[]) { u32 mark, newmark, mask = 0; if (cda[CTA_MARK_MASK]) mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); mark = ntohl(nla_get_be32(cda[CTA_MARK])); newmark = (READ_ONCE(ct->mark) & mask) ^ mark; if (newmark != READ_ONCE(ct->mark)) WRITE_ONCE(ct->mark, newmark); } #endif static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, }; static int ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) { const struct nlattr *attr = cda[CTA_PROTOINFO]; const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTOINFO_MAX+1]; int err = 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy, NULL); if (err < 0) return err; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->from_nlattr) err = l4proto->from_nlattr(tb, ct); return err; } static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 }, [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 }, [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, }; static int change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) { int err; struct nlattr *cda[CTA_SEQADJ_MAX+1]; err = nla_parse_nested_deprecated(cda, CTA_SEQADJ_MAX, attr, seqadj_policy, NULL); if (err < 0) return err; if (!cda[CTA_SEQADJ_CORRECTION_POS]) return -EINVAL; seq->correction_pos = ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS])); if (!cda[CTA_SEQADJ_OFFSET_BEFORE]) return -EINVAL; seq->offset_before = ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE])); if (!cda[CTA_SEQADJ_OFFSET_AFTER]) return -EINVAL; seq->offset_after = ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER])); return 0; } static int ctnetlink_change_seq_adj(struct nf_conn *ct, const struct nlattr * const cda[]) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); int ret = 0; if (!seqadj) return 0; 
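/* Both directions are updated under ct->lock so the seqadj values and the IPS_SEQ_ADJUST status bit stay consistent with packets being translated concurrently. */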
spin_lock_bh(&ct->lock); if (cda[CTA_SEQ_ADJ_ORIG]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) goto err; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } if (cda[CTA_SEQ_ADJ_REPLY]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) goto err; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } spin_unlock_bh(&ct->lock); return 0; err: spin_unlock_bh(&ct->lock); return ret; } static const struct nla_policy synproxy_policy[CTA_SYNPROXY_MAX + 1] = { [CTA_SYNPROXY_ISN] = { .type = NLA_U32 }, [CTA_SYNPROXY_ITS] = { .type = NLA_U32 }, [CTA_SYNPROXY_TSOFF] = { .type = NLA_U32 }, }; static int ctnetlink_change_synproxy(struct nf_conn *ct, const struct nlattr * const cda[]) { struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); struct nlattr *tb[CTA_SYNPROXY_MAX + 1]; int err; if (!synproxy) return 0; err = nla_parse_nested_deprecated(tb, CTA_SYNPROXY_MAX, cda[CTA_SYNPROXY], synproxy_policy, NULL); if (err < 0) return err; if (!tb[CTA_SYNPROXY_ISN] || !tb[CTA_SYNPROXY_ITS] || !tb[CTA_SYNPROXY_TSOFF]) return -EINVAL; synproxy->isn = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ISN])); synproxy->its = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ITS])); synproxy->tsoff = ntohl(nla_get_be32(tb[CTA_SYNPROXY_TSOFF])); return 0; } static int ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) { #ifdef CONFIG_NF_CONNTRACK_LABELS size_t len = nla_len(cda[CTA_LABELS]); const void *mask = cda[CTA_LABELS_MASK]; if (len & (sizeof(u32)-1)) /* must be multiple of u32 */ return -EINVAL; if (mask) { if (nla_len(cda[CTA_LABELS_MASK]) == 0 || nla_len(cda[CTA_LABELS_MASK]) != len) return -EINVAL; mask = nla_data(cda[CTA_LABELS_MASK]); } len /= sizeof(u32); return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len); #else return -EOPNOTSUPP; #endif } static int ctnetlink_change_conntrack(struct nf_conn *ct, const struct nlattr * const cda[]) { int err; /* only allow NAT changes and master assignation for new conntracks */ if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER]) return -EOPNOTSUPP; if (cda[CTA_HELP]) { err = ctnetlink_change_helper(ct, cda); if (err < 0) return err; } if (cda[CTA_TIMEOUT]) { err = ctnetlink_change_timeout(ct, cda); if (err < 0) return err; } if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) return err; } if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) return err; } #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ctnetlink_change_mark(ct, cda); #endif if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) return err; } if (cda[CTA_SYNPROXY]) { err = ctnetlink_change_synproxy(ct, cda); if (err < 0) return err; } if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) return err; } return 0; } static struct nf_conn * ctnetlink_create_conntrack(struct net *net, const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, u8 u3) { struct nf_conn *ct; int err = -EINVAL; struct nf_conntrack_helper *helper; struct nf_conn_tstamp *tstamp; u64 timeout; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) return ERR_PTR(-ENOMEM); if (!cda[CTA_TIMEOUT]) goto err1; rcu_read_lock(); if (cda[CTA_HELP]) { char *helpname = NULL; struct nlattr *helpinfo = NULL; err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); if (err < 0) goto err2; 
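/* Helper lookup happens under rcu_read_lock(); if the helper is missing, the lock is dropped and the "nfct-helper-<name>" module is requested. Should the helper appear after the module load, -EAGAIN is returned so the request can be retried with the helper present. */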
helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { err = -EOPNOTSUPP; goto err1; } rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper) { err = -EAGAIN; goto err2; } rcu_read_unlock(); #endif err = -EOPNOTSUPP; goto err1; } else { struct nf_conn_help *help; help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help == NULL) { err = -ENOMEM; goto err2; } /* set private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); /* disable helper auto-assignment for this entry */ ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } } err = ctnetlink_setup_nat(ct, cda); if (err < 0) goto err2; nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); nf_ct_labels_ext_add(ct); nfct_seqadj_ext_add(ct); nfct_synproxy_ext_add(ct); /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; __nf_ct_set_timeout(ct, timeout); if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) goto err2; } if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) goto err2; } memset(&ct->proto, 0, sizeof(ct->proto)); if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) goto err2; } if (cda[CTA_SYNPROXY]) { err = ctnetlink_change_synproxy(ct, cda); if (err < 0) goto err2; } #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ctnetlink_change_mark(ct, cda); #endif /* setup master conntrack: this is a confirmed expectation */ if (cda[CTA_TUPLE_MASTER]) { struct nf_conntrack_tuple master; struct nf_conntrack_tuple_hash *master_h; struct nf_conn *master_ct; err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3, NULL); if (err < 0) goto err2; master_h = nf_conntrack_find_get(net, zone, &master); if (master_h == NULL) { err = -ENOENT; goto err2; } master_ct = nf_ct_tuplehash_to_ctrack(master_h); __set_bit(IPS_EXPECTED_BIT, &ct->status); ct->master = master_ct; } tstamp = nf_conn_tstamp_find(ct); if (tstamp) tstamp->start = ktime_get_real_ns(); err = nf_conntrack_hash_check_insert(ct); if (err < 0) goto err3; rcu_read_unlock(); return ct; err3: if (ct->master) nf_ct_put(ct->master); err2: rcu_read_unlock(); err1: nf_conntrack_free(ct); return ERR_PTR(err); } static int ctnetlink_new_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_zone zone; struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); if (err < 0) return err; if (cda[CTA_TUPLE_ORIG]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_REPLY]) { err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_ORIG]) h = nf_conntrack_find_get(info->net, &zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) h = nf_conntrack_find_get(info->net, &zone, &rtuple); if (h == NULL) { err = -ENOENT; if (info->nlh->nlmsg_flags & NLM_F_CREATE) { enum ip_conntrack_events events; if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) return -EINVAL; if 
(otuple.dst.protonum != rtuple.dst.protonum) return -EINVAL; ct = ctnetlink_create_conntrack(info->net, &zone, cda, &otuple, &rtuple, u3); if (IS_ERR(ct)) return PTR_ERR(ct); err = 0; if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = 1 << IPCT_RELATED; else events = 1 << IPCT_NEW; if (cda[CTA_LABELS] && ctnetlink_attach_labels(ct, cda) == 0) events |= (1 << IPCT_LABEL); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | (1 << IPCT_MARK) | (1 << IPCT_SYNPROXY) | events, ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_put(ct); } return err; } /* implicit 'else' */ err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) { err = ctnetlink_change_conntrack(ct, cda); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | (1 << IPCT_MARK) | (1 << IPCT_SYNPROXY), ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } } nf_ct_put(ct); return err; } static int ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, __u16 cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS_CPU); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, htons(cpu)); if (!nlh) goto nlmsg_failure; if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) || nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, htonl(st->search_restart)) || nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, htonl(st->clash_resolve)) || nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, htonl(st->chaintoolong))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nla_put_failure: nlmsg_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) { int cpu; struct net *net = sock_net(skb->sk); if (cb->args[0] == nr_cpu_ids) return 0; for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { const struct ip_conntrack_stat *st; if (!cpu_possible(cpu)) continue; st = per_cpu_ptr(net->ct.stat, cpu); if (ctnetlink_ct_stat_cpu_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cpu, st) < 0) break; } cb->args[0] = cpu; return skb->len; } static int ctnetlink_stat_ct_cpu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_ct_stat_cpu_dump, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; } static int ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct net *net) { unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; unsigned int nr_conntracks; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; nr_conntracks = nf_conntrack_count(net); if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nla_put_failure: nlmsg_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct sk_buff *skb2; int err; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) return -ENOMEM; err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), sock_net(skb->sk)); if (err <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, [CTA_EXPECT_ID] = { .type = NLA_U32 }, [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, }; static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, struct nf_conntrack_helper *helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT static size_t ctnetlink_glue_build_size(const struct nf_conn *ct) { return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + nla_total_size(0) /* CTA_PROTOINFO */ + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) + ctnetlink_acct_size(ct) + ctnetlink_timestamp_size(ct) #if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ #endif #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) ; } static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_ORIG) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if 
(ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_REPL) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; if (ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_helpinfo(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_SECMARK if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0) goto nla_put_failure; #endif if (ct->master && ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; if ((ct->status & IPS_SEQ_ADJUST) && ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_ct_synproxy(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK if (ctnetlink_dump_mark(skb, ct, true) < 0) goto nla_put_failure; #endif if (ctnetlink_dump_labels(skb, ct) < 0) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static int ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u_int16_t ct_attr, u_int16_t ct_info_attr) { struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, ct_attr); if (!nest_parms) goto nla_put_failure; if (__ctnetlink_glue_build(skb, ct) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (nla_put_be32(skb, ct_info_attr, htonl(ctinfo))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static int ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) { unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); unsigned long d = ct->status ^ status; if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) /* SEEN_REPLY bit can only be set */ return -EBUSY; if (d & IPS_ASSURED && !(status & IPS_ASSURED)) /* ASSURED bit can only be set */ return -EBUSY; /* This check is less strict than ctnetlink_change_status() * because callers often flip IPS_EXPECTED bits when sending * an NFQA_CT attribute to the kernel. So ignore the * unchangeable bits but do not error out. Also user programs * are allowed to clear the bits that they are allowed to change. 
*/ __nf_ct_change_status(ct, status, ~status); return 0; } static int ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) { int err; if (cda[CTA_TIMEOUT]) { err = ctnetlink_change_timeout(ct, cda); if (err < 0) return err; } if (cda[CTA_STATUS]) { err = ctnetlink_update_status(ct, cda); if (err < 0) return err; } if (cda[CTA_HELP]) { err = ctnetlink_change_helper(ct, cda); if (err < 0) return err; } if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) return err; } #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) { ctnetlink_change_mark(ct, cda); } #endif return 0; } static int ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) { struct nlattr *cda[CTA_MAX+1]; int ret; ret = nla_parse_nested_deprecated(cda, CTA_MAX, attr, ct_nla_policy, NULL); if (ret < 0) return ret; return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); } static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, const struct nf_conn *ct, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { int err; err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, nf_ct_l3num(ct), NULL); if (err < 0) return err; return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, nf_ct_l3num(ct), NULL); } static int ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report) { struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; struct nf_conntrack_helper *helper = NULL; struct nf_conntrack_expect *exp; int err; err = nla_parse_nested_deprecated(cda, CTA_EXPECT_MAX, attr, exp_nla_policy, NULL); if (err < 0) return err; err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda, ct, &tuple, &mask); if (err < 0) return err; if (cda[CTA_EXPECT_HELP_NAME]) { const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) return -EOPNOTSUPP; } exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, helper, &tuple, &mask); if (IS_ERR(exp)) return PTR_ERR(exp); err = nf_ct_expect_related_report(exp, portid, report, 0); nf_ct_expect_put(exp); return err; } static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff) { if (!(ct->status & IPS_NAT_MASK)) return; nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff); } static const struct nfnl_ct_hook ctnetlink_glue_hook = { .build_size = ctnetlink_glue_build_size, .build = ctnetlink_glue_build, .parse = ctnetlink_glue_parse, .attach_expect = ctnetlink_glue_attach_expect, .seq_adjust = ctnetlink_glue_seqadj, }; #endif /* CONFIG_NETFILTER_NETLINK_GLUE_CT */ /*********************************************************************** * EXPECT ***********************************************************************/ static int ctnetlink_exp_dump_tuple(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, u32 type) { struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, type); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, tuple) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int ctnetlink_exp_dump_mask(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple m; struct nlattr *nest_parms; int ret; memset(&m, 0xFF, sizeof(m)); memcpy(&m.src.u3, &mask->src.u3, 
sizeof(m.src.u3)); m.src.u.all = mask->src.u.all; m.src.l3num = tuple->src.l3num; m.dst.protonum = tuple->dst.protonum; nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK); if (!nest_parms) goto nla_put_failure; rcu_read_lock(); ret = ctnetlink_dump_tuples_ip(skb, &m); if (ret >= 0) { l4proto = nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); } rcu_read_unlock(); if (unlikely(ret < 0)) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } #if IS_ENABLED(CONFIG_NF_NAT) static const union nf_inet_addr any_addr; #endif static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) { static siphash_aligned_key_t exp_id_seed; unsigned long a, b, c, d; net_get_random_once(&exp_id_seed, sizeof(exp_id_seed)); a = (unsigned long)exp; b = (unsigned long)exp->helper; c = (unsigned long)exp->master; d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed); #ifdef CONFIG_64BIT return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed); #else return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed); #endif } static int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) { struct nf_conn *master = exp->master; long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; struct nf_conn_help *help; #if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *nest_parms; struct nf_conntrack_tuple nat_tuple = {}; #endif struct nf_ct_helper_expectfn *expfn; if (timeout < 0) timeout = 0; if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) goto nla_put_failure; if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0) goto nla_put_failure; if (ctnetlink_exp_dump_tuple(skb, &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, CTA_EXPECT_MASTER) < 0) goto nla_put_failure; #if IS_ENABLED(CONFIG_NF_NAT) if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) || exp->saved_proto.all) { nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT); if (!nest_parms) goto nla_put_failure; if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir))) goto nla_put_failure; nat_tuple.src.l3num = nf_ct_l3num(master); nat_tuple.src.u3 = exp->saved_addr; nat_tuple.dst.protonum = nf_ct_protonum(master); nat_tuple.src.u = exp->saved_proto; if (ctnetlink_exp_dump_tuple(skb, &nat_tuple, CTA_EXPECT_NAT_TUPLE) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); } #endif if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) || nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) goto nla_put_failure; help = nfct_help(master); if (help) { struct nf_conntrack_helper *helper; helper = rcu_dereference(help->helper); if (helper && nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name)) goto nla_put_failure; } expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn); if (expfn != NULL && nla_put_string(skb, CTA_EXPECT_FN, expfn->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct nf_conntrack_expect *exp) { struct nlmsghdr *nlh; unsigned int flags = portid ? 
NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, exp->tuple.src.l3num, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item) { struct nf_conntrack_expect *exp = item->exp; struct net *net = nf_ct_exp_net(exp); struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int type, group; int flags = 0; if (events & (1 << IPEXP_DESTROY)) { type = IPCTNL_MSG_EXP_DELETE; group = NFNLGRP_CONNTRACK_EXP_DESTROY; } else if (events & (1 << IPEXP_NEW)) { type = IPCTNL_MSG_EXP_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; group = NFNLGRP_CONNTRACK_EXP_NEW; } else return 0; if (!item->report && !nfnetlink_has_listeners(net, group)) return 0; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (skb == NULL) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type); nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, exp->tuple.src.l3num, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); nlmsg_failure: kfree_skb(skb); errout: nfnetlink_set_err(net, 0, 0, -ENOBUFS); return 0; } #endif static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp) { unsigned long id = (unsigned long)exp; id += nf_ct_get_id(exp->master); id += exp->class; return id ? id : 1; } static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; unsigned long last_id = cb->args[1]; struct nf_conntrack_expect *exp; rcu_read_lock(); for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; if (!net_eq(nf_ct_net(exp->master), net)) continue; if (cb->args[1]) { if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { cb->args[1] = ctnetlink_exp_id(exp); goto out; } } if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: rcu_read_unlock(); return skb->len; } static int ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nf_conn *ct = cb->data; struct nf_conn_help *help = nfct_help(ct); u_int8_t l3proto = nfmsg->nfgen_family; unsigned long last_id = cb->args[1]; struct nf_conntrack_expect *exp; if (cb->args[0]) return 0; rcu_read_lock(); restart: hlist_for_each_entry_rcu(exp, &help->expectations, lnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; if (cb->args[1]) { if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { cb->args[1] = ctnetlink_exp_id(exp); goto out; } } if (cb->args[1]) { cb->args[1] = 0; goto restart; } cb->args[0] = 1; out: rcu_read_unlock(); return skb->len; } static int ctnetlink_dump_exp_ct(struct net *net, struct 
sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { int err; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, }; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3, NULL); if (err < 0) return err; err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); if (err < 0) return err; h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); /* No expectation linked to this connection tracking. */ if (!nfct_help(ct)) { nf_ct_put(ct); return 0; } c.data = ct; err = netlink_dump_start(ctnl, skb, nlh, &c); nf_ct_put(ct); return err; } static int ctnetlink_get_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; struct sk_buff *skb2; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { if (cda[CTA_EXPECT_MASTER]) return ctnetlink_dump_exp_ct(info->net, info->sk, skb, info->nlh, cda, info->extack); else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } } err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); if (err < 0) return err; if (cda[CTA_EXPECT_TUPLE]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3, NULL); else if (cda[CTA_EXPECT_MASTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3, NULL); else return -EINVAL; if (err < 0) return err; exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } } skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) { nf_ct_expect_put(exp); return -ENOMEM; } rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); nf_ct_expect_put(exp); if (err <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) { struct nf_conntrack_helper *helper; const struct nf_conn_help *m_help; const char *name = data; m_help = nfct_help(exp->master); helper = rcu_dereference(m_help->helper); if (!helper) return false; return strcmp(helper->name, name) == 0; } static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data) { return true; } static int ctnetlink_del_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; int err; if (cda[CTA_EXPECT_TUPLE]) { /* delete a single expect by tuple */ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3, NULL); if (err < 0) return err; /* bump usage count to 2 */ exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); if (id != nf_expect_get_id(exp)) 
{ nf_ct_expect_put(exp); return -ENOENT; } } /* after list removal, usage count == 1 */ spin_lock_bh(&nf_conntrack_expect_lock); if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_expect_put(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); /* have to put what we 'get' above. * after this line usage count == 0 */ nf_ct_expect_put(exp); } else if (cda[CTA_EXPECT_HELP_NAME]) { char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); nf_ct_expect_iterate_net(info->net, expect_iter_name, name, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } else { /* This basically means we have to flush everything*/ nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } return 0; } static int ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { if (cda[CTA_EXPECT_TIMEOUT]) { if (!timer_delete(&x->timeout)) return -ETIME; x->timeout.expires = jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; add_timer(&x->timeout); } return 0; } #if IS_ENABLED(CONFIG_NF_NAT) static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, }; #endif static int ctnetlink_parse_expect_nat(const struct nlattr *attr, struct nf_conntrack_expect *exp, u_int8_t u3) { #if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; struct nf_conntrack_tuple nat_tuple = {}; int err; err = nla_parse_nested_deprecated(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE]) return -EINVAL; err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3, NULL); if (err < 0) return err; exp->saved_addr = nat_tuple.src.u3; exp->saved_proto = nat_tuple.src.u; exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR])); return 0; #else return -EOPNOTSUPP; #endif } static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, struct nf_conntrack_helper *helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { u_int32_t class = 0; struct nf_conntrack_expect *exp; struct nf_conn_help *help; int err; help = nfct_help(ct); if (!help) return ERR_PTR(-EOPNOTSUPP); if (cda[CTA_EXPECT_CLASS] && helper) { class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); if (class > helper->expect_class_max) return ERR_PTR(-EINVAL); } exp = nf_ct_expect_alloc(ct); if (!exp) return ERR_PTR(-ENOMEM); if (cda[CTA_EXPECT_FLAGS]) { exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); exp->flags &= ~NF_CT_EXPECT_USERSPACE; } else { exp->flags = 0; } if (cda[CTA_EXPECT_FN]) { const char *name = nla_data(cda[CTA_EXPECT_FN]); struct nf_ct_helper_expectfn *expfn; expfn = nf_ct_helper_expectfn_find_by_name(name); if (expfn == NULL) { err = -EINVAL; goto err_out; } exp->expectfn = expfn->expectfn; } else exp->expectfn = NULL; exp->class = class; exp->master = ct; exp->helper = helper; exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; exp->mask.src.u.all = mask->src.u.all; if (cda[CTA_EXPECT_NAT]) { err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT], exp, nf_ct_l3num(ct)); if (err < 0) goto err_out; } return exp; err_out: nf_ct_expect_put(exp); return ERR_PTR(err); } static int ctnetlink_create_expect(struct net *net, const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], u_int8_t u3, u32 
portid, int report) { struct nf_conntrack_tuple tuple, mask, master_tuple; struct nf_conntrack_tuple_hash *h = NULL; struct nf_conntrack_helper *helper = NULL; struct nf_conntrack_expect *exp; struct nf_conn *ct; int err; /* caller guarantees that those three CTA_EXPECT_* exist */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3, NULL); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3, NULL); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3, NULL); if (err < 0) return err; /* Look for master conntrack of this expectation */ h = nf_conntrack_find_get(net, zone, &master_tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); rcu_read_lock(); if (cda[CTA_EXPECT_HELP_NAME]) { const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); helper = __nf_conntrack_helper_find(helpname, u3, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { err = -EOPNOTSUPP; goto err_ct; } rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, u3, nf_ct_protonum(ct)); if (helper) { err = -EAGAIN; goto err_rcu; } rcu_read_unlock(); #endif err = -EOPNOTSUPP; goto err_ct; } } exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto err_rcu; } err = nf_ct_expect_related_report(exp, portid, report, 0); nf_ct_expect_put(exp); err_rcu: rcu_read_unlock(); err_ct: nf_ct_put(ct); return err; } static int ctnetlink_new_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; int err; if (!cda[CTA_EXPECT_TUPLE] || !cda[CTA_EXPECT_MASK] || !cda[CTA_EXPECT_MASTER]) return -EINVAL; err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3, NULL); if (err < 0) return err; spin_lock_bh(&nf_conntrack_expect_lock); exp = __nf_ct_expect_find(info->net, &zone, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; if (info->nlh->nlmsg_flags & NLM_F_CREATE) { err = ctnetlink_create_expect(info->net, &zone, cda, u3, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } return err; } err = -EEXIST; if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); spin_unlock_bh(&nf_conntrack_expect_lock); return err; } static int ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_EXP_GET_STATS_CPU); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, htons(cpu)); if (!nlh) goto nlmsg_failure; if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nla_put_failure: nlmsg_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) { int cpu; struct net *net = sock_net(skb->sk); if (cb->args[0] == nr_cpu_ids) return 0; for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { const struct ip_conntrack_stat *st; if (!cpu_possible(cpu)) continue; st = per_cpu_ptr(net->ct.stat, cpu); if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cpu, st) < 0) break; } cb->args[0] = cpu; return skb->len; } static int ctnetlink_stat_exp_cpu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_exp_stat_cpu_dump, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; } #ifdef CONFIG_NF_CONNTRACK_EVENTS static struct nf_ct_event_notifier ctnl_notifier = { .ct_event = ctnetlink_conntrack_event, .exp_event = ctnetlink_expect_event, }; #endif static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, .type = NFNL_CB_MUTEX, .attr_count = CTA_MAX, .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, .type = NFNL_CB_MUTEX, .attr_count = CTA_MAX, .policy = ct_nla_policy }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, .type = NFNL_CB_MUTEX, .attr_count = CTA_MAX, .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, .type = NFNL_CB_MUTEX, .attr_count = CTA_MAX, .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu, .type = NFNL_CB_MUTEX, }, [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct, .type = NFNL_CB_MUTEX, }, [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying, .type = NFNL_CB_MUTEX, }, [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed, .type = NFNL_CB_MUTEX, }, }; static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, .type = NFNL_CB_MUTEX, .attr_count = CTA_EXPECT_MAX, .policy = exp_nla_policy }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, .type = NFNL_CB_MUTEX, .attr_count = CTA_EXPECT_MAX, .policy = exp_nla_policy }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, .type = NFNL_CB_MUTEX, .attr_count = CTA_EXPECT_MAX, .policy = exp_nla_policy }, [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu, .type = NFNL_CB_MUTEX, }, }; static const struct nfnetlink_subsystem ctnl_subsys = { .name = "conntrack", .subsys_id = NFNL_SUBSYS_CTNETLINK, .cb_count = IPCTNL_MSG_MAX, .cb = ctnl_cb, }; static const struct nfnetlink_subsystem ctnl_exp_subsys = { .name = "conntrack_expect", .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, .cb_count = IPCTNL_MSG_EXP_MAX, .cb = ctnl_exp_cb, }; MODULE_ALIAS("ip_conntrack_netlink"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); static int __net_init 
ctnetlink_net_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS nf_conntrack_register_notifier(net, &ctnl_notifier); #endif return 0; } static void ctnetlink_net_pre_exit(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS nf_conntrack_unregister_notifier(net); #endif } static struct pernet_operations ctnetlink_net_ops = { .init = ctnetlink_net_init, .pre_exit = ctnetlink_net_pre_exit, }; static int __init ctnetlink_init(void) { int ret; NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { pr_err("ctnetlink_init: cannot register with nfnetlink.\n"); goto err_out; } ret = nfnetlink_subsys_register(&ctnl_exp_subsys); if (ret < 0) { pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n"); goto err_unreg_subsys; } ret = register_pernet_subsys(&ctnetlink_net_ops); if (ret < 0) { pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT /* setup interaction between nf_queue and nf_conntrack_netlink. */ RCU_INIT_POINTER(nfnl_ct_hook, &ctnetlink_glue_hook); #endif return 0; err_unreg_exp_subsys: nfnetlink_subsys_unregister(&ctnl_exp_subsys); err_unreg_subsys: nfnetlink_subsys_unregister(&ctnl_subsys); err_out: return ret; } static void __exit ctnetlink_exit(void) { unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT RCU_INIT_POINTER(nfnl_ct_hook, NULL); #endif synchronize_rcu(); } module_init(ctnetlink_init); module_exit(ctnetlink_exit); |
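The status handling in ctnetlink_update_status() above relies on the XOR of the old and requested status word to detect which bits a netlink request is trying to flip, and it refuses any attempt to clear IPS_SEEN_REPLY or IPS_ASSURED. What follows is a minimal, userspace-only sketch of that set-only rule, not kernel code: the function name status_update_ok() is hypothetical, and the bit values are copied here from include/uapi/linux/netfilter/nf_conntrack_common.h purely for illustration.

/* Illustrative sketch of the "set-only bits" rule enforced by
 * ctnetlink_update_status(): IPS_SEEN_REPLY and IPS_ASSURED may be set via
 * netlink but never cleared again.
 */
#include <stdio.h>

#define IPS_SEEN_REPLY	(1UL << 1)
#define IPS_ASSURED	(1UL << 2)

/* Return 0 if the transition old -> requested is acceptable, -1 otherwise
 * (the kernel returns -EBUSY in the rejected case).
 */
static int status_update_ok(unsigned long old, unsigned long requested)
{
	unsigned long d = old ^ requested;

	if ((d & IPS_SEEN_REPLY) && !(requested & IPS_SEEN_REPLY))
		return -1;	/* attempt to clear SEEN_REPLY */
	if ((d & IPS_ASSURED) && !(requested & IPS_ASSURED))
		return -1;	/* attempt to clear ASSURED */
	return 0;
}

int main(void)
{
	/* setting ASSURED on a connection that has seen a reply: allowed (0) */
	printf("%d\n", status_update_ok(IPS_SEEN_REPLY,
					IPS_SEEN_REPLY | IPS_ASSURED));
	/* trying to clear SEEN_REPLY: rejected (-1) */
	printf("%d\n", status_update_ok(IPS_SEEN_REPLY | IPS_ASSURED,
					IPS_ASSURED));
	return 0;
}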
| 7 1 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H #include <linux/sched.h> #include <linux/ftrace.h> #include <linux/rethook.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) #define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; #if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; /* * If non-NULL: The current frame is incomplete and doesn't contain a * valid BP. When looking for the next frame, use this instead of the * non-existent saved BP. */ unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; #endif }; void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); bool unwind_next_frame(struct unwind_state *state); unsigned long unwind_get_return_address(struct unwind_state *state); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } static inline bool unwind_error(struct unwind_state *state) { return state->error; } static inline void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { first_frame = first_frame ? : get_stack_pointer(task, regs); __unwind_start(state, task, regs, first_frame); } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* * If 'partial' returns true, only the iret frame registers are valid. */ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { if (unwind_done(state)) return NULL; if (partial) { #ifdef CONFIG_UNWINDER_ORC *partial = !state->full_regs; #else *partial = false; #endif } return state->regs; } #else static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { return NULL; } #endif #ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); #else static inline void unwind_init(void) {} static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif static inline unsigned long unwind_recover_rethook(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { #ifdef CONFIG_RETHOOK if (is_rethook_trampoline(addr)) return rethook_find_ret_addr(state->task, (unsigned long)addr_p, &state->kr_cur); #endif return addr; } /* Recover the return address modified by rethook and ftrace_graph. 
*/ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { unsigned long ret; ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return unwind_recover_rethook(state, ret, addr_p); } /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned * the stack in the meantime. */ #define READ_ONCE_TASK_STACK(task, x) \ ({ \ unsigned long val; \ if (task == current) \ val = READ_ONCE(x); \ else \ val = READ_ONCE_NOCHECK(x); \ val; \ }) static inline bool task_on_another_cpu(struct task_struct *task) { #ifdef CONFIG_SMP return task != current && task->on_cpu; #else return false; #endif } #endif /* _ASM_X86_UNWIND_H */ |
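The header above only declares the unwinder interface; a consumer seeds a struct unwind_state with unwind_start() and then advances one frame at a time until unwind_done() reports the end of the stack. The sketch below shows that loop in isolation. dump_task_return_addresses() is a hypothetical helper, not part of this header, and it assumes it is called where the task's stack is stable (the task is blocked, or it is current).

/* Illustrative consumer of the unwind API declared above. */
#include <asm/unwind.h>
#include <linux/printk.h>

static void dump_task_return_addresses(struct task_struct *task,
				       struct pt_regs *regs)
{
	struct unwind_state state;
	unsigned long addr;

	/* Passing NULL as first_frame makes unwind_start() fall back to the
	 * current stack pointer of @task/@regs.
	 */
	for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
	     unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr)
			break;
		pr_info("ret addr: %pS\n", (void *)addr);
	}
}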
855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 | /* * net/tipc/msg.h: Include file for TIPC message header routines * * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_MSG_H #define _TIPC_MSG_H #include <linux/tipc.h> #include "core.h" /* * Constants and routines used to read and write TIPC payload message headers * * Note: Some items are also used with TIPC internal message headers */ #define TIPC_VERSION 2 struct plist; /* * Payload message users are defined in TIPC's public API: * - TIPC_LOW_IMPORTANCE * - TIPC_MEDIUM_IMPORTANCE * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ #define TIPC_SYSTEM_IMPORTANCE 4 /* * Payload message types */ #define TIPC_CONN_MSG 0 #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 #define TIPC_GRP_MCAST_MSG 6 #define TIPC_GRP_UCAST_MSG 7 /* * Internal message users */ #define BCAST_PROTOCOL 5 #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 #define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define MSG_CRYPTO 14 #define SOCK_WAKEUP 14 /* pseudo user */ #define TOP_SRV 15 /* pseudo user */ /* * Message header sizes */ #define SHORT_H_SIZE 24 /* In-cluster basic payload message */ #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ #define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) #define TIPC_MEDIA_INFO_OFFSET 5 extern const int one_page_mtu; struct tipc_skb_cb { union { struct { struct sk_buff *tail; unsigned long nxt_retr; unsigned long retr_stamp; u32 bytes_read; u32 orig_member; u16 chain_imp; u16 ackers; u16 retr_cnt; } __packed; #ifdef CONFIG_TIPC_CRYPTO struct { struct tipc_crypto *rx; struct tipc_aead *last; u8 recurs; } tx_clone_ctx __packed; #endif } __packed; union { struct { u8 validated:1; #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; #define SKB_PROBING 1 #define SKB_GRACING 2 u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; u8 flags; }; u8 reserved; #ifdef CONFIG_TIPC_CRYPTO void *crypto_ctx; #endif } __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) struct tipc_msg { __be32 hdr[15]; }; /* struct tipc_gap_ack - TIPC Gap ACK block * @ack: seqno of the last consecutive packet in link deferdq * @gap: number of gap packets since the last ack * * E.g: * link deferdq: 1 2 3 4 10 11 13 14 15 20 * --> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0> */ struct tipc_gap_ack { __be16 ack; __be16 gap; }; /* struct tipc_gap_ack_blks * @len: actual length of the record * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast * ones) * @start_index: starting index for "valid" broadcast Gap ACK blocks * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks * * 31 16 
15 0 * +-------------+-------------+-------------+-------------+ * | bgack_cnt | ugack_cnt | len | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > bc gacks * : : : | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > uc gacks * : : : | * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; union { u8 ugack_cnt; u8 start_index; }; u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { return (struct tipc_msg *)skb->data; } static inline u32 msg_word(struct tipc_msg *m, u32 pos) { return ntohl(m->hdr[pos]); } static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) { m->hdr[w] = htonl(val); } static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask) { return (msg_word(m, w) >> pos) & mask; } static inline void msg_set_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask, u32 val) { val = (val & mask) << pos; mask = mask << pos; m->hdr[w] &= ~htonl(mask); m->hdr[w] |= htonl(val); } /* * Word 0 */ static inline u32 msg_version(struct tipc_msg *m) { return msg_bits(m, 0, 29, 7); } static inline void msg_set_version(struct tipc_msg *m) { msg_set_bits(m, 0, 29, 7, TIPC_VERSION); } static inline u32 msg_user(struct tipc_msg *m) { return msg_bits(m, 0, 25, 0xf); } static inline u32 msg_isdata(struct tipc_msg *m) { return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; } static inline void msg_set_user(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 25, 0xf, n); } static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; } static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 21, 0xf, n>>2); } static inline u32 msg_size(struct tipc_msg *m) { return msg_bits(m, 0, 0, 0x1ffff); } static inline u32 msg_blocks(struct tipc_msg *m) { return (msg_size(m) / 1024) + 1; } static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); } static inline int msg_non_seq(struct tipc_msg *m) { return msg_bits(m, 0, 20, 1); } static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 20, 1, n); } static inline int msg_is_syn(struct tipc_msg *m) { return msg_bits(m, 0, 17, 1); } static inline void msg_set_syn(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 17, 1, d); } static inline int msg_dest_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_is_keepalive(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 18, 1, d); } static inline int msg_ack_required(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_ack_required(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline int msg_nagle_ack(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_nagle_ack(struct tipc_msg *m) { msg_set_bits(m, 0, 
18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); } static inline void msg_set_is_rcast(struct tipc_msg *m, bool d) { msg_set_bits(m, 0, 18, 0x1, d); } static inline void msg_set_size(struct tipc_msg *m, u32 sz) { m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } static inline unchar *msg_data(struct tipc_msg *m) { return ((unchar *)m) + msg_hdr_sz(m); } static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } /* * Word 1 */ static inline u32 msg_type(struct tipc_msg *m) { return msg_bits(m, 1, 29, 0x7); } static inline void msg_set_type(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 29, 0x7, n); } static inline int msg_in_group(struct tipc_msg *m) { int mtyp = msg_type(m); return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; } static inline bool msg_is_grp_evt(struct tipc_msg *m) { return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; } static inline u32 msg_mcast(struct tipc_msg *m) { int mtyp = msg_type(m); return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) { return msg_type(m) == TIPC_CONN_MSG; } static inline u32 msg_direct(struct tipc_msg *m) { return msg_type(m) == TIPC_DIRECT_MSG; } static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); } static inline void msg_set_errcode(struct tipc_msg *m, u32 err) { msg_set_bits(m, 1, 25, 0xf, err); } static inline void msg_set_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 28, 0x1, 1); } static inline u32 msg_is_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 28, 0x1); } static inline void msg_set_last_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 27, 0x1, 1); } static inline u32 msg_is_last_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 27, 0x1); } static inline void msg_set_non_legacy(struct tipc_msg *m) { msg_set_bits(m, 1, 26, 0x1, 1); } static inline u32 msg_is_legacy(struct tipc_msg *m) { return !msg_bits(m, 1, 26, 0x1); } static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); } static inline void msg_incr_reroute_cnt(struct tipc_msg *m) { msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); } static inline u32 msg_lookup_scope(struct tipc_msg *m) { return msg_bits(m, 1, 19, 0x3); } static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 19, 0x3, n); } static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* Note: reusing bits in word 1 for ACTIVATE_MSG only, to re-synch * link peer session number */ static inline bool msg_dest_session_valid(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1); } static inline void msg_set_dest_session_valid(struct tipc_msg *m, bool valid) { msg_set_bits(m, 1, 16, 0x1, valid); } static inline u16 msg_dest_session(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_dest_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* * Word 2 */ static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void 
msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Words 3-10 */ static inline u32 msg_importance(struct tipc_msg *m) { int usr = msg_user(m); if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) return usr; if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { int usr = msg_user(m); if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) msg_set_bits(m, 9, 0, 0x7, i); else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); } static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); } static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 3, a); } static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = msg_inner_hdr(m); return msg_word(m, 4); } static inline void msg_set_origport(struct tipc_msg *m, u32 p) { msg_set_word(m, 4, p); } static inline u16 msg_named_seqno(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_destport(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline u32 msg_mc_netid(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline int msg_short(struct tipc_msg *m) { return msg_hdr_sz(m) == SHORT_H_SIZE; } static inline u32 msg_orignode(struct tipc_msg *m) { if (likely(msg_short(m))) return msg_prevnode(m); return msg_word(m, 6); } static inline void msg_set_orignode(struct tipc_msg *m, u32 a) { msg_set_word(m, 6, a); } static inline u32 msg_destnode(struct tipc_msg *m) { return msg_word(m, 7); } static inline void msg_set_destnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 7, a); } static inline u32 msg_nametype(struct tipc_msg *m) { return msg_word(m, 8); } static inline void msg_set_nametype(struct tipc_msg *m, u32 n) { msg_set_word(m, 8, n); } static inline u32 msg_nameinst(struct tipc_msg *m) { return msg_word(m, 9); } static inline u32 msg_namelower(struct tipc_msg *m) { return msg_nameinst(m); } static inline void msg_set_namelower(struct tipc_msg *m, u32 n) { msg_set_word(m, 9, n); } static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) { msg_set_namelower(m, n); } static inline u32 msg_nameupper(struct tipc_msg *m) { return msg_word(m, 10); } static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) { msg_set_word(m, 10, n); } /* * Constants and routines used to read and write TIPC internal message headers */ /* * Connection management protocol message types */ #define CONN_PROBE 0 #define CONN_PROBE_REPLY 1 #define CONN_ACK 2 /* * Name distributor message types */ #define PUBLICATION 0 #define WITHDRAWAL 1 /* * Segmentation message types */ #define FIRST_FRAGMENT 0 #define FRAGMENT 1 #define LAST_FRAGMENT 2 /* * Link management protocol message types */ #define STATE_MSG 0 #define RESET_MSG 1 #define ACTIVATE_MSG 2 /* * Changeover tunnel message types */ #define SYNCH_MSG 0 #define FAILOVER_MSG 1 /* * Config protocol message types */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 #define DSC_TRIAL_MSG 2 #define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types */ #define GRP_JOIN_MSG 0 #define 
GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 #define GRP_ACK_MSG 3 #define GRP_RECLAIM_MSG 4 #define GRP_REMIT_MSG 5 /* Crypto message types */ #define KEY_DISTR_MSG 0 /* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1fff); } static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 16, 0x1fff, n); } static inline u32 msg_node_sig(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_node_sig(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 0, 0xffff, n); } static inline u32 msg_node_capabilities(struct tipc_msg *m) { return msg_bits(m, 1, 15, 0x1fff); } static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 15, 0x1fff, n); } /* * Word 2 */ static inline u32 msg_dest_domain(struct tipc_msg *m) { return msg_word(m, 2); } static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) { msg_set_word(m, 2, n); } static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u32 msg_bcgap_to(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Word 4 */ static inline u32 msg_last_bcast(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline u32 msg_bc_snd_nxt(struct tipc_msg *m) { return msg_last_bcast(m) + 1; } static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u32 msg_nof_fragms(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_fragm_no(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_bc_netid(struct tipc_msg *m) { return msg_word(m, 4); } static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) { msg_set_word(m, 4, id); } static inline u32 msg_link_selector(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = (void *)msg_data(m); return msg_bits(m, 4, 0, 1); } /* * Word 5 */ static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } static inline u32 msg_probe(struct tipc_msg *m) { return msg_bits(m, 5, 0, 1); } static inline void msg_set_probe(struct tipc_msg *m, u32 val) { msg_set_bits(m, 5, 0, 1, val); } static inline char msg_net_plane(struct tipc_msg *m) { return msg_bits(m, 5, 1, 7) + 'A'; } static inline void msg_set_net_plane(struct tipc_msg *m, char n) { msg_set_bits(m, 5, 1, 7, (n - 'A')); } static inline u32 msg_linkprio(struct tipc_msg *m) { return msg_bits(m, 5, 4, 0x1f); } static inline void msg_set_linkprio(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 4, 0x1f, n); } static inline u32 msg_bearer_id(struct tipc_msg *m) { return msg_bits(m, 5, 9, 0x7); } static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 9, 0x7, n); } static inline u32 msg_redundant_link(struct tipc_msg *m) { return msg_bits(m, 5, 12, 0x1); } static inline void msg_set_redundant_link(struct tipc_msg *m, 
u32 r) { msg_set_bits(m, 5, 12, 0x1, r); } static inline u32 msg_peer_stopping(struct tipc_msg *m) { return msg_bits(m, 5, 13, 0x1); } static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s) { msg_set_bits(m, 5, 13, 0x1, s); } static inline bool msg_bc_ack_invalid(struct tipc_msg *m) { switch (msg_user(m)) { case BCAST_PROTOCOL: case NAME_DISTRIBUTOR: case LINK_PROTOCOL: return msg_bits(m, 5, 14, 0x1); default: return false; } } static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid) { msg_set_bits(m, 5, 14, 0x1, invalid); } static inline char *msg_media_addr(struct tipc_msg *m) { return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } static inline u32 msg_bc_gap(struct tipc_msg *m) { return msg_bits(m, 8, 0, 0x3ff); } static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 8, 0, 0x3ff, n); } /* * Word 9 */ static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; } static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, (n / 4)); } static inline u32 msg_link_tolerance(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_bc_acked(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_remitted(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x3); } static inline void msg_set_grp_evt(struct tipc_msg *m, int n) { msg_set_bits(m, 10, 0, 0x3, n); } static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x1); } static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) { msg_set_bits(m, 10, 0, 0x1, n); } static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); } static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) { msg_set_bits(m, 10, 16, 0xffff, n); } static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) return true; if (msg_type(m) == STATE_MSG) return true; return false; } static inline bool msg_peer_node_is_up(struct tipc_msg *m) { if (msg_peer_link_is_up(m)) return true; return msg_redundant_link(m); } static inline 
bool msg_is_reset(struct tipc_msg *hdr) { return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } /* Word 13 */ static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) { msg_set_word(m, 13, n); } static inline u32 msg_peer_net_hash(struct tipc_msg *m) { return msg_word(m, 13); } /* Word 14 */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); } static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) { msg_set_word(m, 14, n); } static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) { memcpy(msg_data(hdr), id, 16); } static inline u8 *msg_node_id(struct tipc_msg *hdr) { return (u8 *)msg_data(hdr); } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); static inline u16 buf_seqno(struct sk_buff *skb) { return msg_seqno(buf_msg(skb)); } static inline int buf_roundup_len(struct sk_buff *skb) { return (skb->len / 1024 + 1) * 1024; } /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any */ static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list, spinlock_t *lock) { struct sk_buff *skb; spin_lock_bh(lock); skb = skb_peek(list); if (skb) skb_get(skb); spin_unlock_bh(lock); return skb; } /* tipc_skb_peek_port(): find a destination port, ignoring all destinations * up to and including 'filter'. * Note: ignoring previously tried destinations minimizes the risk of * contention on the socket lock * @list: list to be peeked in * @filter: last destination to be ignored from search * Returns a destination port number, of applicable. 
*/ static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter) { struct sk_buff *skb; u32 dport = 0; bool ignore = true; spin_lock_bh(&list->lock); skb_queue_walk(list, skb) { dport = msg_destport(buf_msg(skb)); if (!filter || skb_queue_is_last(list, skb)) break; if (dport == filter) ignore = false; else if (!ignore) break; } spin_unlock_bh(&list->lock); return dport; } /* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list * @list: list to be unlinked from * @dport: selection criteria for buffer to unlink */ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, u32 dport) { struct sk_buff *_skb, *tmp, *skb = NULL; spin_lock_bh(&list->lock); skb_queue_walk_safe(list, _skb, tmp) { if (msg_destport(buf_msg(_skb)) == dport) { __skb_unlink(_skb, list); skb = _skb; break; } } spin_unlock_bh(&list->lock); return skb; } /* tipc_skb_queue_splice_tail - append an skb list to lock protected list * @list: the new list to append. Not lock protected * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, struct sk_buff_head *head) { spin_lock_bh(&head->lock); skb_queue_splice_tail(list, head); spin_unlock_bh(&head->lock); } /* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists * @list: the new list to add. Lock protected. Will be reinitialized * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { struct sk_buff_head tmp; __skb_queue_head_init(&tmp); spin_lock_bh(&list->lock); skb_queue_splice_tail_init(list, &tmp); spin_unlock_bh(&list->lock); tipc_skb_queue_splice_tail(&tmp, head); } /* __tipc_skb_dequeue() - dequeue the head skb according to expected seqno * @list: list to be dequeued from * @seqno: seqno of the expected msg * * returns skb dequeued from the list if its seqno is less than or equal to * the expected one, otherwise the skb is still hold * * Note: must be used with appropriate locks held only */ static inline struct sk_buff *__tipc_skb_dequeue(struct sk_buff_head *list, u16 seqno) { struct sk_buff *skb = skb_peek(list); if (skb && less_eq(buf_seqno(skb), seqno)) { __skb_unlink(skb, list); return skb; } return NULL; } #endif |
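/*
 * Editor's note: an illustrative sketch, not part of the TIPC sources.
 * It shows how the lock-protected queue helpers declared above are meant
 * to be combined: tipc_skb_peek_port() picks the next destination port
 * (skipping destinations up to and including 'filter'), and
 * tipc_skb_dequeue() unlinks every buffer bound for that port.  The
 * function name and the way buffers are consumed here are hypothetical.
 */
#if 0	/* example only, never compiled */
static void example_drain_next_port(struct sk_buff_head *inputq, u32 last_port)
{
	struct sk_buff *skb;
	u32 dport;

	/* Find the next destination port, ignoring ports tried before. */
	dport = tipc_skb_peek_port(inputq, last_port);
	if (!dport)
		return;

	/* Unlink every buffer addressed to that port. */
	while ((skb = tipc_skb_dequeue(inputq, dport)) != NULL)
		kfree_skb(skb);	/* a real caller would deliver to the socket */
}
#endif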
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/buffer_head.h> #include <linux/delay.h> #include <linux/sort.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/module.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/workqueue.h> #include <linux/jiffies.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> #include <linux/bit_spinlock.h> #include <linux/percpu.h> #include <linux/list_sort.h> #include <linux/lockref.h> #include <linux/rhashtable.h> #include <linux/pid_namespace.h> #include <linux/file.h> #include <linux/random.h> #include "gfs2.h" #include "incore.h" #include "glock.h" #include "glops.h" #include "inode.h" #include "lops.h" #include "meta_io.h" #include "quota.h" #include "super.h" #include "util.h" #include "bmap.h" #define CREATE_TRACE_POINTS #include "trace_gfs2.h" struct gfs2_glock_iter { struct gfs2_sbd *sdp; /* incore superblock */ struct rhashtable_iter hti; /* rhashtable iterator */ struct gfs2_glock *gl; /* current glock struct */ loff_t last_pos; /* last position */ }; typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static void request_demote(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote); static struct dentry *gfs2_root; static LIST_HEAD(lru_list); static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) static const struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), .head_offset = offsetof(struct gfs2_glock, gl_node), }; static struct rhashtable gl_hash_table; #define GLOCK_WAIT_TABLE_BITS 12 #define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; struct wait_glock_queue { struct lm_lockname *name; wait_queue_entry_t wait; }; static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct wait_glock_queue *wait_glock = container_of(wait, struct wait_glock_queue, wait); struct lm_lockname *wait_name = wait_glock->name; struct lm_lockname *wake_name = key; if (wake_name->ln_sbd != wait_name->ln_sbd || wake_name->ln_number != wait_name->ln_number || wake_name->ln_type != wait_name->ln_type) return 0; return autoremove_wake_function(wait, mode, sync, key); } static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) { u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0); return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); } /** * wake_up_glock - Wake up waiters on a glock * @gl: the glock */ static void wake_up_glock(struct gfs2_glock *gl) { wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); } static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); kfree(gl->gl_lksb.sb_lvbptr); if (gl->gl_ops->go_flags & GLOF_ASPACE) { struct gfs2_glock_aspace *gla = container_of(gl, struct gfs2_glock_aspace, 
glock); kmem_cache_free(gfs2_glock_aspace_cachep, gla); } else kmem_cache_free(gfs2_glock_cachep, gl); } static void __gfs2_glock_free(struct gfs2_glock *gl) { rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); } void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; __gfs2_glock_free(gl); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_kill_wait); } void gfs2_glock_free_later(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; spin_lock(&lru_lock); list_add(&gl->gl_lru, &sdp->sd_dead_glocks); spin_unlock(&lru_lock); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_kill_wait); } static void gfs2_free_dead_glocks(struct gfs2_sbd *sdp) { struct list_head *list = &sdp->sd_dead_glocks; while(!list_empty(list)) { struct gfs2_glock *gl; gl = list_first_entry(list, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); __gfs2_glock_free(gl); } } /** * gfs2_glock_hold() - increment reference count on glock * @gl: The glock to hold * */ struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); return gl; } static void gfs2_glock_add_to_lru(struct gfs2_glock *gl) { spin_lock(&lru_lock); list_move_tail(&gl->gl_lru, &lru_list); if (!test_bit(GLF_LRU, &gl->gl_flags)) { set_bit(GLF_LRU, &gl->gl_flags); atomic_inc(&lru_count); } spin_unlock(&lru_lock); } static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { spin_lock(&lru_lock); if (test_bit(GLF_LRU, &gl->gl_flags)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); } spin_unlock(&lru_lock); } /* * Enqueue the glock on the work queue. Passes one glock reference on to the * work queue. */ static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!queue_delayed_work(sdp->sd_glock_wq, &gl->gl_work, delay)) { /* * We are holding the lockref spinlock, and the work was still * queued above. The queued work (glock_work_func) takes that * spinlock before dropping its glock reference(s), so it * cannot have dropped them in the meantime. */ GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2); gl->gl_lockref.count--; } } static void __gfs2_glock_put(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); lockref_mark_dead(&gl->gl_lockref); spin_unlock(&gl->gl_lockref.lock); gfs2_glock_remove_from_lru(gl); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); if (mapping) { truncate_inode_pages_final(mapping); if (!gfs2_withdrawn(sdp)) GLOCK_BUG_ON(gl, !mapping_empty(mapping)); } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } static bool __gfs2_glock_put_or_lock(struct gfs2_glock *gl) { if (lockref_put_or_lock(&gl->gl_lockref)) return true; GLOCK_BUG_ON(gl, gl->gl_lockref.count != 1); if (gl->gl_state != LM_ST_UNLOCKED) { gl->gl_lockref.count--; gfs2_glock_add_to_lru(gl); spin_unlock(&gl->gl_lockref.lock); return true; } return false; } /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * */ void gfs2_glock_put(struct gfs2_glock *gl) { if (__gfs2_glock_put_or_lock(gl)) return; __gfs2_glock_put(gl); } /* * gfs2_glock_put_async - Decrement reference count without sleeping * @gl: The glock to put * * Decrement the reference count on glock immediately unless it is the last * reference. 
Defer putting the last reference to work queue context. */ void gfs2_glock_put_async(struct gfs2_glock *gl) { if (__gfs2_glock_put_or_lock(gl)) return; gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } /** * may_grant - check if it's ok to grant a new lock * @gl: The glock * @current_gh: One of the current holders of @gl * @gh: The lock request which we wish to grant * * With our current compatibility rules, if a glock has one or more active * holders (HIF_HOLDER flag set), any of those holders can be passed in as * @current_gh; they are all the same as far as compatibility with the new @gh * goes. * * Returns true if it's ok to grant the lock. */ static inline bool may_grant(struct gfs2_glock *gl, struct gfs2_holder *current_gh, struct gfs2_holder *gh) { if (current_gh) { GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags)); switch(current_gh->gh_state) { case LM_ST_EXCLUSIVE: /* * Here we make a special exception to grant holders * who agree to share the EX lock with other holders * who also have the bit set. If the original holder * has the LM_FLAG_NODE_SCOPE bit set, we grant more * holders with the bit set. */ return gh->gh_state == LM_ST_EXCLUSIVE && (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) && (gh->gh_flags & LM_FLAG_NODE_SCOPE); case LM_ST_SHARED: case LM_ST_DEFERRED: return gh->gh_state == current_gh->gh_state; default: return false; } } if (gl->gl_state == gh->gh_state) return true; if (gh->gh_flags & GL_EXACT) return false; if (gl->gl_state == LM_ST_EXCLUSIVE) { return gh->gh_state == LM_ST_SHARED || gh->gh_state == LM_ST_DEFERRED; } if (gh->gh_flags & LM_FLAG_ANY) return gl->gl_state != LM_ST_UNLOCKED; return false; } static void gfs2_holder_wake(struct gfs2_holder *gh) { clear_bit(HIF_WAIT, &gh->gh_iflags); smp_mb__after_atomic(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); if (gh->gh_flags & GL_ASYNC) { struct gfs2_sbd *sdp = gh->gh_gl->gl_name.ln_sbd; wake_up(&sdp->sd_async_glock_wait); } } /** * do_error - Something unexpected has happened during a lock request * @gl: The glock * @ret: The status from the DLM */ static void do_error(struct gfs2_glock *gl, const int ret) { struct gfs2_holder *gh, *tmp; list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (ret & LM_OUT_ERROR) gh->gh_error = -EIO; else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) gh->gh_error = GLR_TRYFAILED; else continue; list_del_init(&gh->gh_list); trace_gfs2_glock_queue(gh, 0); gfs2_holder_wake(gh); } } /** * find_first_holder - find the first "holder" gh * @gl: the glock */ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) { struct gfs2_holder *gh; if (!list_empty(&gl->gl_holders)) { gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); if (test_bit(HIF_HOLDER, &gh->gh_iflags)) return gh; } return NULL; } /* * gfs2_instantiate - Call the glops instantiate function * @gh: The glock holder * * Returns: 0 if instantiate was successful, or error. */ int gfs2_instantiate(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; const struct gfs2_glock_operations *glops = gl->gl_ops; int ret; again: if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags)) goto done; /* * Since we unlock the lockref lock, we set a flag to indicate * instantiate is in progress. */ if (test_and_set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG, TASK_UNINTERRUPTIBLE); /* * Here we just waited for a different instantiate to finish.
* But that may not have been successful, as when a process * locks an inode glock _before_ it has an actual inode to * instantiate into. So we check again. This process might * have an inode to instantiate, so might be successful. */ goto again; } ret = glops->go_instantiate(gl); if (!ret) clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); if (ret) return ret; done: if (glops->go_held) return glops->go_held(gh); return 0; } /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock */ static void do_promote(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_holder *gh, *current_gh; if (gfs2_withdrawn(sdp)) { do_error(gl, LM_OUT_ERROR); return; } current_gh = find_first_holder(gl); list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (!may_grant(gl, current_gh, gh)) { /* * If we get here, it means we may not grant this * holder for some reason. */ if (current_gh) do_error(gl, 0); /* Fail queued try locks */ break; } set_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_promote(gh); gfs2_holder_wake(gh); if (!current_gh) current_gh = gh; } } /** * find_first_waiter - find the first gh that's waiting for the glock * @gl: the glock */ static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) { struct gfs2_holder *gh; list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) return gh; } return NULL; } /** * find_last_waiter - find the last gh that's waiting for the glock * @gl: the glock * * This also is a fast way of finding out if there are any waiters. */ static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl) { struct gfs2_holder *gh; if (list_empty(&gl->gl_holders)) return NULL; gh = list_last_entry(&gl->gl_holders, struct gfs2_holder, gh_list); return test_bit(HIF_HOLDER, &gh->gh_iflags) ? 
NULL : gh; } /** * state_change - record that the glock is now in a different state * @gl: the glock * @new_state: the new state */ static void state_change(struct gfs2_glock *gl, unsigned int new_state) { if (new_state != gl->gl_target) /* shorten our minimum hold time */ gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, GL_GLOCK_MIN_HOLD); gl->gl_state = new_state; gl->gl_tchange = jiffies; } static void gfs2_set_demote(int nr, struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; set_bit(nr, &gl->gl_flags); smp_mb(); wake_up(&sdp->sd_async_glock_wait); } static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); smp_mb__after_atomic(); wake_up_bit(&gl->gl_flags, GLF_DEMOTE); } /** * finish_xmote - The DLM has replied to one of our lock requests * @gl: The glock * @ret: The status from the DLM * */ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) { const struct gfs2_glock_operations *glops = gl->gl_ops; if (!(ret & ~LM_OUT_ST_MASK)) { unsigned state = ret & LM_OUT_ST_MASK; trace_gfs2_glock_state_change(gl, state); state_change(gl, state); } /* Demote to UN request arrived during demote to SH or DF */ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) gl->gl_target = LM_ST_UNLOCKED; /* Check for state != intended state */ if (unlikely(gl->gl_state != gl->gl_target)) { struct gfs2_holder *gh = find_first_waiter(gl); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { if (ret & LM_OUT_CANCELED) { list_del_init(&gh->gh_list); trace_gfs2_glock_queue(gh, 0); gfs2_holder_wake(gh); gl->gl_target = gl->gl_state; goto out; } /* Some error or failed "try lock" - report it */ if ((ret & LM_OUT_ERROR) || (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { gl->gl_target = gl->gl_state; do_error(gl, ret); goto out; } } switch(gl->gl_state) { /* Unlocked due to conversion deadlock, try again */ case LM_ST_UNLOCKED: do_xmote(gl, gh, gl->gl_target); break; /* Conversion fails, unlock and try again */ case LM_ST_SHARED: case LM_ST_DEFERRED: do_xmote(gl, gh, LM_ST_UNLOCKED); break; default: /* Everything else */ fs_err(gl->gl_name.ln_sbd, "glock %u:%llu requested=%u ret=%u\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gl->gl_req, ret); GLOCK_BUG_ON(gl, 1); } return; } /* Fast path - we got what we asked for */ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_demote_wake(gl); } if (gl->gl_state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { int rv; spin_unlock(&gl->gl_lockref.lock); rv = glops->go_xmote_bh(gl); spin_lock(&gl->gl_lockref.lock); if (rv) { do_error(gl, rv); goto out; } } do_promote(gl); } out: if (!test_bit(GLF_CANCELING, &gl->gl_flags)) clear_bit(GLF_LOCK, &gl->gl_flags); } /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state * @gh: The holder (only for promotes) * @target: The target lock state * */ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; int ret; /* * When a filesystem is withdrawing, the remaining cluster nodes will * take care of recovering the withdrawing node's journal. 
We only * need to make sure that once we trigger remote recovery, we won't * write to the shared block device anymore. This means that here, * * - no new writes to the filesystem must be triggered (->go_sync()). * * - any cached data should be discarded by calling ->go_inval(), dirty * or not and journaled or unjournaled. * * - no more dlm locking operations should be issued (->lm_lock()). */ GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if (!glops->go_inval || !glops->go_sync) goto skip_inval; spin_unlock(&gl->gl_lockref.lock); if (!gfs2_withdrawn(sdp)) { ret = glops->go_sync(gl); if (ret) { if (cmpxchg(&sdp->sd_log_error, 0, ret)) { fs_err(sdp, "Error %d syncing glock\n", ret); gfs2_dump_glock(NULL, gl, true); gfs2_withdraw(sdp); } } } if (target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); spin_lock(&gl->gl_lockref.lock); skip_inval: if (gfs2_withdrawn(sdp)) { if (target != LM_ST_UNLOCKED) target = LM_OUT_ERROR; goto out; } if (ls->ls_ops->lm_lock) { set_bit(GLF_PENDING_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); ret = ls->ls_ops->lm_lock(gl, target, gh ? gh->gh_flags : 0); spin_lock(&gl->gl_lockref.lock); if (!ret) { /* The operation will be completed asynchronously. */ gl->gl_lockref.count++; return; } clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); if (ret == -ENODEV) { /* * The lockspace has been released and the lock has * been unlocked implicitly. */ if (target != LM_ST_UNLOCKED) { target = LM_OUT_ERROR; goto out; } } else { fs_err(sdp, "lm_lock ret %d\n", ret); GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp)); return; } } out: /* Complete the operation now. */ finish_xmote(gl, target); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); } /** * run_queue - do all outstanding tasks related to a glock * @gl: The glock in question * @nonblock: True if we must not block in run_queue * */ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { struct gfs2_holder *gh; if (test_bit(GLF_LOCK, &gl->gl_flags)) return; set_bit(GLF_LOCK, &gl->gl_flags); /* * The GLF_DEMOTE_IN_PROGRESS flag is only set intermittently during * locking operations. We have just started a locking operation by * setting the GLF_LOCK flag, so the GLF_DEMOTE_IN_PROGRESS flag must * be cleared. 
*/ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { if (gl->gl_demote_state == gl->gl_state) { gfs2_demote_wake(gl); goto promote; } if (find_first_holder(gl)) goto out_unlock; if (nonblock) goto out_sched; set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; do_xmote(gl, NULL, gl->gl_target); return; } promote: do_promote(gl); if (find_first_holder(gl)) goto out_unlock; gh = find_first_waiter(gl); if (!gh) goto out_unlock; if (nonblock) goto out_sched; gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ do_xmote(gl, gh, gl->gl_target); return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); return; out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); } /** * glock_set_object - set the gl_object field of a glock * @gl: the glock * @object: the object */ void glock_set_object(struct gfs2_glock *gl, void *object) { void *prev_object; spin_lock(&gl->gl_lockref.lock); prev_object = gl->gl_object; gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) gfs2_dump_glock(NULL, gl, true); } /** * glock_clear_object - clear the gl_object field of a glock * @gl: the glock * @object: object the glock currently points at */ void glock_clear_object(struct gfs2_glock *gl, void *object) { void *prev_object; spin_lock(&gl->gl_lockref.lock); prev_object = gl->gl_object; gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) gfs2_dump_glock(NULL, gl, true); } void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) { struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; if (ri->ri_magic == 0) ri->ri_magic = cpu_to_be32(GFS2_MAGIC); if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC)) ri->ri_generation_deleted = cpu_to_be64(generation); } bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) { struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC)) return false; return generation <= be64_to_cpu(ri->ri_generation_deleted); } static void gfs2_glock_poke(struct gfs2_glock *gl) { int flags = LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP; struct gfs2_holder gh; int error; __gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_); error = gfs2_glock_nq(&gh); if (!error) gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); } static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl) { struct gfs2_inode *ip; spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; if (ip && !igrab(&ip->i_inode)) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { wait_on_new_inode(&ip->i_inode); if (is_bad_inode(&ip->i_inode)) { iput(&ip->i_inode); ip = NULL; } } return ip; } static void gfs2_try_to_evict(struct gfs2_glock *gl) { struct gfs2_inode *ip; /* * If there is contention on the iopen glock and we have an inode, try * to grab and release the inode so that it can be evicted. The * GLF_DEFER_DELETE flag indicates to gfs2_evict_inode() that the inode * should not be deleted locally. This will allow the remote node to * go ahead and delete the inode without us having to do it, which will * avoid rgrp glock thrashing. 
* * The remote node is likely still holding the corresponding inode * glock, so it will run before we get to verify that the delete has * happened below. (Verification is triggered by the call to * gfs2_queue_verify_delete() in gfs2_evict_inode().) */ ip = gfs2_grab_existing_inode(gl); if (ip) { set_bit(GLF_DEFER_DELETE, &gl->gl_flags); d_prune_aliases(&ip->i_inode); iput(&ip->i_inode); clear_bit(GLF_DEFER_DELETE, &gl->gl_flags); /* If the inode was evicted, gl->gl_object will now be NULL. */ ip = gfs2_grab_existing_inode(gl); if (ip) { gfs2_glock_poke(ip->i_gl); iput(&ip->i_inode); } } } bool gfs2_queue_try_to_evict(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) return false; return !mod_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } bool gfs2_queue_verify_delete(struct gfs2_glock *gl, bool later) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned long delay; if (test_and_set_bit(GLF_VERIFY_DELETE, &gl->gl_flags)) return false; delay = later ? HZ + get_random_long() % (HZ * 9) : 0; return queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, delay); } static void delete_work_func(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; bool verify_delete = test_and_clear_bit(GLF_VERIFY_DELETE, &gl->gl_flags); /* * Check for the GLF_VERIFY_DELETE above: this ensures that we won't * immediately process GLF_VERIFY_DELETE work that the below call to * gfs2_try_to_evict() queues. */ if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) gfs2_try_to_evict(gl); if (verify_delete) { u64 no_addr = gl->gl_name.ln_number; struct inode *inode; inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, GFS2_BLKST_UNLINKED); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -EAGAIN && !test_bit(SDF_KILL, &sdp->sd_flags) && gfs2_queue_verify_delete(gl, true)) return; } else { d_prune_aliases(inode); iput(inode); } } gfs2_glock_put(gl); } static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); unsigned int drop_refs = 1; spin_lock(&gl->gl_lockref.lock); if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags)) { clear_bit(GLF_HAVE_REPLY, &gl->gl_flags); finish_xmote(gl, gl->gl_reply); drop_refs++; } if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { if (gl->gl_name.ln_type == LM_TYPE_INODE) { unsigned long holdtime, now = jiffies; holdtime = gl->gl_tchange + gl->gl_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; } if (!delay) { clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); gfs2_set_demote(GLF_DEMOTE, gl); } } run_queue(gl, 0); if (delay) { /* Keep one glock reference for the work we requeue. */ drop_refs--; gfs2_glock_queue_work(gl, delay); } /* Drop the remaining glock references manually. 
*/ GLOCK_BUG_ON(gl, gl->gl_lockref.count < drop_refs); gl->gl_lockref.count -= drop_refs; if (!gl->gl_lockref.count) { if (gl->gl_state == LM_ST_UNLOCKED) { __gfs2_glock_put(gl); return; } gfs2_glock_add_to_lru(gl); } spin_unlock(&gl->gl_lockref.lock); } static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, struct gfs2_glock *new) { struct wait_glock_queue wait; wait_queue_head_t *wq = glock_waitqueue(name); struct gfs2_glock *gl; wait.name = name; init_wait(&wait.wait); wait.wait.func = glock_wake_function; again: prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); rcu_read_lock(); if (new) { gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, &new->gl_node, ht_parms); if (IS_ERR(gl)) goto out; } else { gl = rhashtable_lookup_fast(&gl_hash_table, name, ht_parms); } if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { rcu_read_unlock(); schedule(); goto again; } out: rcu_read_unlock(); finish_wait(wq, &wait.wait); if (gl) gfs2_glock_remove_from_lru(gl); return gl; } /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock * @number: the lock number * @glops: The glock_operations to use * @create: If 0, don't create the glock if it doesn't exist * @glp: the glock is returned here * * This does not lock a glock, just finds/creates structures for one. * * Returns: errno */ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp) { struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type, .ln_sbd = sdp }; struct gfs2_glock *gl, *tmp; struct address_space *mapping; gl = find_insert_glock(&name, NULL); if (gl) goto found; if (!create) return -ENOENT; if (glops->go_flags & GLOF_ASPACE) { struct gfs2_glock_aspace *gla = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_NOFS); if (!gla) return -ENOMEM; gl = &gla->glock; } else { gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_NOFS); if (!gl) return -ENOMEM; } memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); gl->gl_ops = glops; if (glops->go_flags & GLOF_LVB) { gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { gfs2_glock_dealloc(&gl->gl_rcu); return -ENOMEM; } } atomic_inc(&sdp->sd_glock_disposal); gl->gl_node.next = NULL; gl->gl_flags = BIT(GLF_INITIAL); if (glops->go_instantiate) gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED); gl->gl_name = name; lockref_init(&gl->gl_lockref); lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_dstamp = 0; preempt_disable(); /* We use the global stats to estimate the initial per-glock stats */ gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; preempt_enable(); gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); if (gl->gl_name.ln_type == LM_TYPE_IOPEN) INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { gfp_t gfp_mask; mapping->a_ops = &gfs2_meta_aops; mapping->host = sdp->sd_inode; mapping->flags = 0; gfp_mask = mapping_gfp_mask(sdp->sd_inode->i_mapping); mapping_set_gfp_mask(mapping, gfp_mask); mapping->i_private_data = NULL; mapping->writeback_index = 0; } tmp = find_insert_glock(&name, gl); if (tmp) { gfs2_glock_dealloc(&gl->gl_rcu); if 
(atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_kill_wait); if (IS_ERR(tmp)) return PTR_ERR(tmp); gl = tmp; } found: *glp = gl; return 0; } /** * __gfs2_holder_init - initialize a struct gfs2_holder in the default way * @gl: the glock * @state: the state we're requesting * @flags: the modifier flags * @gh: the holder structure * @ip: caller's return address for debugging */ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh, unsigned long ip) { INIT_LIST_HEAD(&gh->gh_list); gh->gh_gl = gfs2_glock_hold(gl); gh->gh_ip = ip; gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; } /** * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it * @state: the state we're requesting * @flags: the modifier flags * @gh: the holder structure * * Don't mess with the glock. * */ void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh) { gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; gh->gh_ip = _RET_IP_; put_pid(gh->gh_owner_pid); gh->gh_owner_pid = get_pid(task_pid(current)); } /** * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) * @gh: the holder structure * */ void gfs2_holder_uninit(struct gfs2_holder *gh) { put_pid(gh->gh_owner_pid); gfs2_glock_put(gh->gh_gl); gfs2_holder_mark_uninitialized(gh); gh->gh_ip = 0; } static void gfs2_glock_update_hold_time(struct gfs2_glock *gl, unsigned long start_time) { /* Have we waited longer that a second? */ if (time_after(jiffies, start_time + HZ)) { /* Lengthen the minimum hold time. */ gl->gl_hold_time = min(gl->gl_hold_time + GL_GLOCK_HOLD_INCR, GL_GLOCK_MAX_HOLD); } } /** * gfs2_glock_holder_ready - holder is ready and its error code can be collected * @gh: the glock holder * * Called when a glock holder no longer needs to be waited for because it is * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has * failed (gh_error != 0). */ int gfs2_glock_holder_ready(struct gfs2_holder *gh) { if (gh->gh_error || (gh->gh_flags & GL_SKIP)) return gh->gh_error; gh->gh_error = gfs2_instantiate(gh); if (gh->gh_error) gfs2_glock_dq(gh); return gh->gh_error; } /** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * * Returns: 0 on success */ int gfs2_glock_wait(struct gfs2_holder *gh) { unsigned long start_time = jiffies; might_sleep(); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); gfs2_glock_update_hold_time(gh->gh_gl, start_time); return gfs2_glock_holder_ready(gh); } static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs) { int i; for (i = 0; i < num_gh; i++) if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) return 1; return 0; } /** * gfs2_glock_async_wait - wait on multiple asynchronous glock acquisitions * @num_gh: the number of holders in the array * @ghs: the glock holder array * * Returns: 0 on success, meaning all glocks have been granted and are held. * -ESTALE if the request timed out, meaning all glocks were released, * and the caller should retry the operation. */ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) { struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd; int i, ret = 0, timeout = 0; unsigned long start_time = jiffies; might_sleep(); /* * Total up the (minimum hold time * 2) of all glocks and use that to * determine the max amount of time we should wait. 
*/ for (i = 0; i < num_gh; i++) timeout += ghs[i].gh_gl->gl_hold_time << 1; if (!wait_event_timeout(sdp->sd_async_glock_wait, !glocks_pending(num_gh, ghs), timeout)) { ret = -ESTALE; /* request timed out. */ goto out; } for (i = 0; i < num_gh; i++) { struct gfs2_holder *gh = &ghs[i]; int ret2; if (test_bit(HIF_HOLDER, &gh->gh_iflags)) { gfs2_glock_update_hold_time(gh->gh_gl, start_time); } ret2 = gfs2_glock_holder_ready(gh); if (!ret) ret = ret2; } out: if (ret) { for (i = 0; i < num_gh; i++) { struct gfs2_holder *gh = &ghs[i]; gfs2_glock_dq(gh); } } return ret; } /** * request_demote - process a demote request * @gl: the glock * @state: the state the caller wants us to change to * @delay: zero to demote immediately; otherwise pending demote * @remote: true if this came from a different cluster node * * There are only two requests that we are going to see in actual * practise: LM_ST_SHARED and LM_ST_UNLOCKED */ static void request_demote(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote) { gfs2_set_demote(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, gl); if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; } else if (gl->gl_demote_state != LM_ST_UNLOCKED && gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; } if (gl->gl_ops->go_callback) gl->gl_ops->go_callback(gl, remote); trace_gfs2_demote_rq(gl, remote); } void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); if (seq) { seq_vprintf(seq, fmt, args); } else { vaf.fmt = fmt; vaf.va = &args; pr_err("%pV", &vaf); } va_end(args); } static bool gfs2_should_queue_trylock(struct gfs2_glock *gl, struct gfs2_holder *gh) { struct gfs2_holder *current_gh, *gh2; current_gh = find_first_holder(gl); if (current_gh && !may_grant(gl, current_gh, gh)) return false; list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) continue; if (!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) return false; } return true; } static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) return true; return !test_bit(HIF_HOLDER, &gh->gh_iflags); } /** * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add * * Eventually we should move the recursive locking trap to a * debugging option or something like that. This is the fast * path and needs to have the minimum number of distractions. 
* */ static inline void add_to_queue(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_holder *gh2; GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) GLOCK_BUG_ON(gl, true); if ((gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && !gfs2_should_queue_trylock(gl, gh)) { gh->gh_error = GLR_TRYFAILED; gfs2_holder_wake(gh); return; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (likely(gh2->gh_owner_pid != gh->gh_owner_pid)) continue; if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) continue; if (!pid_is_meaningful(gh2)) continue; goto trap_recursive; } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); list_add_tail(&gh->gh_list, &gl->gl_holders); return; trap_recursive: fs_err(sdp, "original: %pSR\n", (void *)gh2->gh_ip); fs_err(sdp, "pid: %d\n", pid_nr(gh2->gh_owner_pid)); fs_err(sdp, "lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); fs_err(sdp, "new: %pSR\n", (void *)gh->gh_ip); fs_err(sdp, "pid: %d\n", pid_nr(gh->gh_owner_pid)); fs_err(sdp, "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); gfs2_dump_glock(NULL, gl, true); BUG(); } /** * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) * @gh: the holder structure * * if (gh->gh_flags & GL_ASYNC), this never returns an error * * Returns: 0, GLR_TRYFAILED, or errno on failure */ int gfs2_glock_nq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error; if (gfs2_withdrawn(sdp)) return -EIO; if (gh->gh_flags & GL_NOBLOCK) { struct gfs2_holder *current_gh; error = -ECHILD; spin_lock(&gl->gl_lockref.lock); if (find_last_waiter(gl)) goto unlock; current_gh = find_first_holder(gl); if (!may_grant(gl, current_gh, gh)) goto unlock; set_bit(HIF_HOLDER, &gh->gh_iflags); list_add_tail(&gh->gh_list, &gl->gl_holders); trace_gfs2_promote(gh); error = 0; unlock: spin_unlock(&gl->gl_lockref.lock); return error; } gh->gh_error = 0; spin_lock(&gl->gl_lockref.lock); add_to_queue(gh); if (unlikely((LM_FLAG_RECOVER & gh->gh_flags) && test_and_clear_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags))) { set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); } run_queue(gl, 1); spin_unlock(&gl->gl_lockref.lock); error = 0; if (!(gh->gh_flags & GL_ASYNC)) error = gfs2_glock_wait(gh); return error; } /** * gfs2_glock_poll - poll to see if an async request has been completed * @gh: the holder * * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on */ int gfs2_glock_poll(struct gfs2_holder *gh) { return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; } static void __gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; unsigned delay = 0; int fast_path = 0; /* * This holder should not be cached, so mark it for demote. * Note: this should be done before the glock_needs_demote * check below. */ if (gh->gh_flags & GL_NOCACHE) request_demote(gl, LM_ST_UNLOCKED, 0, false); list_del_init(&gh->gh_list); clear_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_glock_queue(gh, 0); /* * If there hasn't been a demote request we are done. * (Let the remaining holders, if any, keep holding it.) 
*/ if (!glock_needs_demote(gl)) { if (list_empty(&gl->gl_holders)) fast_path = 1; } if (unlikely(!fast_path)) { gl->gl_lockref.count++; if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && !test_bit(GLF_DEMOTE, &gl->gl_flags) && gl->gl_name.ln_type == LM_TYPE_INODE) delay = gl->gl_hold_time; gfs2_glock_queue_work(gl, delay); } } /** * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) * @gh: the glock holder * */ void gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; spin_lock(&gl->gl_lockref.lock); if (!gfs2_holder_queued(gh)) { /* * May have already been dequeued because the locking request * was GL_ASYNC and it has failed in the meantime. */ goto out; } if (list_is_first(&gh->gh_list, &gl->gl_holders) && !test_bit(HIF_HOLDER, &gh->gh_iflags) && test_bit(GLF_LOCK, &gl->gl_flags) && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && !test_bit(GLF_CANCELING, &gl->gl_flags)) { set_bit(GLF_CANCELING, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); spin_lock(&gl->gl_lockref.lock); clear_bit(GLF_CANCELING, &gl->gl_flags); clear_bit(GLF_LOCK, &gl->gl_flags); if (!gfs2_holder_queued(gh)) goto out; } __gfs2_glock_dq(gh); out: spin_unlock(&gl->gl_lockref.lock); } void gfs2_glock_dq_wait(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); } /** * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it * @gh: the holder structure * */ void gfs2_glock_dq_uninit(struct gfs2_holder *gh) { gfs2_glock_dq(gh); gfs2_holder_uninit(gh); } /** * gfs2_glock_nq_num - acquire a glock based on lock number * @sdp: the filesystem * @number: the lock number * @glops: the glock operations for the type of glock * @state: the state to acquire the glock in * @flags: modifier flags for the acquisition * @gh: the struct gfs2_holder * * Returns: errno */ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, unsigned int state, u16 flags, struct gfs2_holder *gh) { struct gfs2_glock *gl; int error; error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); if (!error) { error = gfs2_glock_nq_init(gl, state, flags, gh); gfs2_glock_put(gl); } return error; } /** * glock_compare - Compare two struct gfs2_glock structures for sorting * @arg_a: the first structure * @arg_b: the second structure * */ static int glock_compare(const void *arg_a, const void *arg_b) { const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; const struct lm_lockname *a = &gh_a->gh_gl->gl_name; const struct lm_lockname *b = &gh_b->gh_gl->gl_name; if (a->ln_number > b->ln_number) return 1; if (a->ln_number < b->ln_number) return -1; BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); return 0; } /** * nq_m_sync - synchronously acquire more than one glock in deadlock free order * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * @p: placeholder for the holder structure to pass back * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) */ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, struct gfs2_holder **p) { unsigned int x; int error = 0; for (x = 0; x < num_gh; x++) p[x] = &ghs[x]; sort(p, num_gh, sizeof(struct gfs2_holder *), 
glock_compare, NULL); for (x = 0; x < num_gh; x++) { error = gfs2_glock_nq(p[x]); if (error) { while (x--) gfs2_glock_dq(p[x]); break; } } return error; } /** * gfs2_glock_nq_m - acquire multiple glocks * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) */ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) { struct gfs2_holder *tmp[4]; struct gfs2_holder **pph = tmp; int error = 0; switch(num_gh) { case 0: return 0; case 1: return gfs2_glock_nq(ghs); default: if (num_gh <= 4) break; pph = kmalloc_array(num_gh, sizeof(struct gfs2_holder *), GFP_NOFS); if (!pph) return -ENOMEM; } error = nq_m_sync(num_gh, ghs, pph); if (pph != tmp) kfree(pph); return error; } /** * gfs2_glock_dq_m - release multiple glocks * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * */ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) { while (num_gh--) gfs2_glock_dq(&ghs[num_gh]); } void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { unsigned long delay = 0; gfs2_glock_hold(gl); spin_lock(&gl->gl_lockref.lock); if (!list_empty(&gl->gl_holders) && gl->gl_name.ln_type == LM_TYPE_INODE) { unsigned long now = jiffies; unsigned long holdtime; holdtime = gl->gl_tchange + gl->gl_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags)) delay = gl->gl_hold_time; } request_demote(gl, state, delay, true); gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); } /** * gfs2_should_freeze - Figure out if glock should be frozen * @gl: The glock in question * * Glocks are not frozen if (a) the result of the dlm operation is * an error, (b) the locking operation was an unlock operation or * (c) if there is a "recover" flagged request anywhere in the queue * * Returns: 1 if freezing should occur, 0 otherwise */ static int gfs2_should_freeze(const struct gfs2_glock *gl) { const struct gfs2_holder *gh; if (gl->gl_reply & ~LM_OUT_ST_MASK) return 0; if (gl->gl_target == LM_ST_UNLOCKED) return 0; list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (LM_FLAG_RECOVER & gh->gh_flags) return 0; } return 1; } /** * gfs2_glock_complete - Callback used by locking * @gl: Pointer to the glock * @ret: The return value from the dlm * * The gl_reply field is under the gl_lockref.lock lock so that it is ok * to use a bitfield shared with other glock state fields. 
*/ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; spin_lock(&gl->gl_lockref.lock); clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { if (gfs2_should_freeze(gl)) { set_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); return; } } gl->gl_lockref.count++; set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } static int glock_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct gfs2_glock *gla, *glb; gla = list_entry(a, struct gfs2_glock, gl_lru); glb = list_entry(b, struct gfs2_glock, gl_lru); if (gla->gl_name.ln_number > glb->gl_name.ln_number) return 1; if (gla->gl_name.ln_number < glb->gl_name.ln_number) return -1; return 0; } static bool can_free_glock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; return !test_bit(GLF_LOCK, &gl->gl_flags) && !gl->gl_lockref.count && (!test_bit(GLF_LFLUSH, &gl->gl_flags) || test_bit(SDF_KILL, &sdp->sd_flags)); } /** * gfs2_dispose_glock_lru - Demote a list of glocks * @list: The list to dispose of * * Disposing of glocks may involve disk accesses, so that here we sort * the glocks by number (i.e. disk location of the inodes) so that if * there are any such accesses, they'll be sent in order (mostly). * * Must be called under the lru_lock, but may drop and retake this * lock. While the lru_lock is dropped, entries may vanish from the * list, but no new entries will appear on the list (since it is * private) */ static unsigned long gfs2_dispose_glock_lru(struct list_head *list) __releases(&lru_lock) __acquires(&lru_lock) { struct gfs2_glock *gl; unsigned long freed = 0; list_sort(NULL, list, glock_cmp); while(!list_empty(list)) { gl = list_first_entry(list, struct gfs2_glock, gl_lru); if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_move(&gl->gl_lru, &lru_list); continue; } if (!can_free_glock(gl)) { spin_unlock(&gl->gl_lockref.lock); goto add_back_to_lru; } list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); freed++; gl->gl_lockref.count++; if (gl->gl_state != LM_ST_UNLOCKED) request_demote(gl, LM_ST_UNLOCKED, 0, false); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); cond_resched_lock(&lru_lock); } return freed; } /** * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote * @nr: The number of entries to scan * * This function selects the entries on the LRU which are able to * be demoted, and then kicks off the process by calling * gfs2_dispose_glock_lru() above. 
*/ static unsigned long gfs2_scan_glock_lru(unsigned long nr) { struct gfs2_glock *gl, *next; LIST_HEAD(dispose); unsigned long freed = 0; spin_lock(&lru_lock); list_for_each_entry_safe(gl, next, &lru_list, gl_lru) { if (!nr--) break; if (can_free_glock(gl)) list_move(&gl->gl_lru, &dispose); } if (!list_empty(&dispose)) freed = gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); return freed; } static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; return gfs2_scan_glock_lru(sc->nr_to_scan); } static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio(atomic_read(&lru_count)); } static struct shrinker *glock_shrinker; /** * glock_hash_walk - Call a function for glock in a hash bucket * @examiner: the function * @sdp: the filesystem * * Note that the function can be called multiple times on the same * object. So the user must ensure that the function can cope with * that. */ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; struct rhashtable_iter iter; rhashtable_walk_enter(&gl_hash_table, &iter); do { rhashtable_walk_start(&iter); while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) { if (gl->gl_name.ln_sbd == sdp) examiner(gl); } rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); rhashtable_walk_exit(&iter); } void gfs2_cancel_delete_work(struct gfs2_glock *gl) { clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags); clear_bit(GLF_VERIFY_DELETE, &gl->gl_flags); if (cancel_delayed_work(&gl->gl_delete)) gfs2_glock_put(gl); } static void flush_delete_work(struct gfs2_glock *gl) { if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (cancel_delayed_work(&gl->gl_delete)) { queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } } } void gfs2_flush_delete_work(struct gfs2_sbd *sdp) { glock_hash_walk(flush_delete_work, sdp); flush_workqueue(sdp->sd_delete_wq); } /** * thaw_glock - thaw out a glock which has an unprocessed reply waiting * @gl: The glock to thaw * */ static void thaw_glock(struct gfs2_glock *gl) { if (!test_and_clear_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags)) return; if (!lockref_get_not_dead(&gl->gl_lockref)) return; gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_lockref.lock); set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } /** * clear_glock - look at a glock and see if we can free it from glock cache * @gl: the glock to look at * */ static void clear_glock(struct gfs2_glock *gl) { gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_lockref.lock); if (!__lockref_is_dead(&gl->gl_lockref)) { gl->gl_lockref.count++; if (gl->gl_state != LM_ST_UNLOCKED) request_demote(gl, LM_ST_UNLOCKED, 0, false); gfs2_glock_queue_work(gl, 0); } spin_unlock(&gl->gl_lockref.lock); } /** * gfs2_glock_thaw - Thaw any frozen glocks * @sdp: The super block * */ void gfs2_glock_thaw(struct gfs2_sbd *sdp) { glock_hash_walk(thaw_glock, sdp); } static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) { spin_lock(&gl->gl_lockref.lock); gfs2_dump_glock(seq, gl, fsid); spin_unlock(&gl->gl_lockref.lock); } static void dump_glock_func(struct gfs2_glock *gl) { dump_glock(NULL, gl, true); } static void withdraw_glock(struct gfs2_glock *gl) { spin_lock(&gl->gl_lockref.lock); if (!__lockref_is_dead(&gl->gl_lockref)) { /* * We don't want to write back any 
more dirty data. Unlock the * remaining inode and resource group glocks; this will cause * their ->go_inval() hooks to toss out all the remaining * cached data, dirty or not. */ if (gl->gl_ops->go_inval && gl->gl_state != LM_ST_UNLOCKED) request_demote(gl, LM_ST_UNLOCKED, 0, false); do_error(gl, LM_OUT_ERROR); /* remove pending waiters */ } spin_unlock(&gl->gl_lockref.lock); } void gfs2_withdraw_glocks(struct gfs2_sbd *sdp) { glock_hash_walk(withdraw_glock, sdp); } /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem * * Called when unmounting the filesystem. */ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { unsigned long start = jiffies; bool timed_out = false; set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); flush_workqueue(sdp->sd_glock_wq); glock_hash_walk(clear_glock, sdp); flush_workqueue(sdp->sd_glock_wq); while (!timed_out) { wait_event_timeout(sdp->sd_kill_wait, !atomic_read(&sdp->sd_glock_disposal), HZ * 60); if (!atomic_read(&sdp->sd_glock_disposal)) break; timed_out = time_after(jiffies, start + (HZ * 600)); fs_warn(sdp, "%u glocks left after %u seconds%s\n", atomic_read(&sdp->sd_glock_disposal), jiffies_to_msecs(jiffies - start) / 1000, timed_out ? ":" : "; still waiting"); } gfs2_lm_unmount(sdp); gfs2_free_dead_glocks(sdp); glock_hash_walk(dump_glock_func, sdp); destroy_workqueue(sdp->sd_glock_wq); sdp->sd_glock_wq = NULL; } static const char *state2str(unsigned state) { switch(state) { case LM_ST_UNLOCKED: return "UN"; case LM_ST_SHARED: return "SH"; case LM_ST_DEFERRED: return "DF"; case LM_ST_EXCLUSIVE: return "EX"; } return "??"; } static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) { char *p = buf; if (flags & LM_FLAG_TRY) *p++ = 't'; if (flags & LM_FLAG_TRY_1CB) *p++ = 'T'; if (flags & LM_FLAG_RECOVER) *p++ = 'e'; if (flags & LM_FLAG_ANY) *p++ = 'A'; if (flags & LM_FLAG_NODE_SCOPE) *p++ = 'n'; if (flags & GL_ASYNC) *p++ = 'a'; if (flags & GL_EXACT) *p++ = 'E'; if (flags & GL_NOCACHE) *p++ = 'c'; if (test_bit(HIF_HOLDER, &iflags)) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; if (flags & GL_SKIP) *p++ = 's'; *p = 0; return buf; } /** * dump_holder - print information about a glock holder * @seq: the seq_file struct * @gh: the glock holder * @fs_id_buf: pointer to file system id (if requested) * */ static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh, const char *fs_id_buf) { const char *comm = "(none)"; pid_t owner_pid = 0; char flags_buf[32]; rcu_read_lock(); if (pid_is_meaningful(gh)) { struct task_struct *gh_owner; comm = "(ended)"; owner_pid = pid_nr(gh->gh_owner_pid); gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); if (gh_owner) comm = gh_owner->comm; } gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n", fs_id_buf, state2str(gh->gh_state), hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip); rcu_read_unlock(); } static const char *gflags2str(char *buf, const struct gfs2_glock *gl) { const unsigned long *gflags = &gl->gl_flags; char *p = buf; if (test_bit(GLF_LOCK, gflags)) *p++ = 'l'; if (test_bit(GLF_DEMOTE, gflags)) *p++ = 'D'; if (test_bit(GLF_PENDING_DEMOTE, gflags)) *p++ = 'd'; if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) *p++ = 'p'; if (test_bit(GLF_DIRTY, gflags)) *p++ = 'y'; if (test_bit(GLF_LFLUSH, gflags)) *p++ = 'f'; if (test_bit(GLF_PENDING_REPLY, gflags)) *p++ = 'R'; if (test_bit(GLF_HAVE_REPLY, gflags)) *p++ = 'r'; if (test_bit(GLF_INITIAL, gflags)) *p++ = 'a'; if (test_bit(GLF_HAVE_FROZEN_REPLY, 
gflags)) *p++ = 'F';
	if (!list_empty(&gl->gl_holders))
		*p++ = 'q';
	if (test_bit(GLF_LRU, gflags))
		*p++ = 'L';
	if (gl->gl_object)
		*p++ = 'o';
	if (test_bit(GLF_BLOCKING, gflags))
		*p++ = 'b';
	if (test_bit(GLF_INSTANTIATE_NEEDED, gflags))
		*p++ = 'n';
	if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags))
		*p++ = 'N';
	if (test_bit(GLF_TRY_TO_EVICT, gflags))
		*p++ = 'e';
	if (test_bit(GLF_VERIFY_DELETE, gflags))
		*p++ = 'E';
	if (test_bit(GLF_DEFER_DELETE, gflags))
		*p++ = 's';
	if (test_bit(GLF_CANCELING, gflags))
		*p++ = 'C';
	*p = 0;
	return buf;
}

/**
 * gfs2_dump_glock - print information about a glock
 * @seq: The seq_file struct
 * @gl: the glock
 * @fsid: If true, also dump the file system id
 *
 * The file format is as follows:
 * One line per object, capital letters are used to indicate objects
 * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
 * other objects are indented by a single space and follow the glock to
 * which they are related. Fields are indicated by lower case letters
 * followed by a colon and the field value, except for strings which are in
 * [] so that it's possible to see if they are composed of spaces for
 * example. The fields are n = number (id of the object), f = flags,
 * t = type, s = state, r = refcount, e = error, p = pid.
 *
 */

void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid)
{
	const struct gfs2_glock_operations *glops = gl->gl_ops;
	unsigned long long dtime;
	const struct gfs2_holder *gh;
	char gflags_buf[32];
	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
	unsigned long nrpages = 0;

	if (gl->gl_ops->go_flags & GLOF_ASPACE) {
		struct address_space *mapping = gfs2_glock2aspace(gl);

		nrpages = mapping->nrpages;
	}
	memset(fs_id_buf, 0, sizeof(fs_id_buf));
	if (fsid && sdp) /* safety precaution */
		sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
	dtime = jiffies - gl->gl_demote_time;
	dtime *= 1000000/HZ; /* demote time in uSec */
	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
		dtime = 0;
	gfs2_print_dbg(seq, "%sG: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d "
		       "v:%d r:%d m:%ld p:%lu\n",
		       fs_id_buf, state2str(gl->gl_state),
		       gl->gl_name.ln_type,
		       (unsigned long long)gl->gl_name.ln_number,
		       gflags2str(gflags_buf, gl),
		       state2str(gl->gl_target),
		       state2str(gl->gl_demote_state), dtime,
		       atomic_read(&gl->gl_ail_count),
		       atomic_read(&gl->gl_revokes),
		       (int)gl->gl_lockref.count, gl->gl_hold_time, nrpages);

	list_for_each_entry(gh, &gl->gl_holders, gh_list)
		dump_holder(seq, gh, fs_id_buf);

	if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
		glops->go_dump(seq, gl, fs_id_buf);
}

static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
	struct gfs2_glock *gl = iter_ptr;

	seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n",
		   gl->gl_name.ln_type,
		   (unsigned long long)gl->gl_name.ln_number,
		   (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
		   (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
		   (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
		   (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
		   (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
		   (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
		   (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
		   (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);

	return 0;
}

static const char *gfs2_gltype[] = {
	"type",
	"reserved",
	"nondisk",
	"inode",
	"rgrp",
	"meta",
	"iopen",
	"flock",
	"plock",
	"quota",
	"journal",
};

static const char *gfs2_stype[] = {
	[GFS2_LKS_SRTT]		= "srtt",
	[GFS2_LKS_SRTTVAR]	= "srttvar",
[GFS2_LKS_SRTTB] = "srttb", [GFS2_LKS_SRTTVARB] = "srttvarb", [GFS2_LKS_SIRT] = "sirt", [GFS2_LKS_SIRTVAR] = "sirtvar", [GFS2_LKS_DCOUNT] = "dlm", [GFS2_LKS_QCOUNT] = "queue", }; #define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype)) static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_sbd *sdp = seq->private; loff_t pos = *(loff_t *)iter_ptr; unsigned index = pos >> 3; unsigned subindex = pos & 0x07; int i; if (index == 0 && subindex != 0) return 0; seq_printf(seq, "%-10s %8s:", gfs2_gltype[index], (index == 0) ? "cpu": gfs2_stype[subindex]); for_each_possible_cpu(i) { const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); if (index == 0) seq_printf(seq, " %15u", i); else seq_printf(seq, " %15llu", (unsigned long long)lkstats-> lkstats[index - 1].stats[subindex]); } seq_putc(seq, '\n'); return 0; } int __init gfs2_glock_init(void) { int i, ret; ret = rhashtable_init(&gl_hash_table, &ht_parms); if (ret < 0) return ret; glock_shrinker = shrinker_alloc(0, "gfs2-glock"); if (!glock_shrinker) { rhashtable_destroy(&gl_hash_table); return -ENOMEM; } glock_shrinker->count_objects = gfs2_glock_shrink_count; glock_shrinker->scan_objects = gfs2_glock_shrink_scan; shrinker_register(glock_shrinker); for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) init_waitqueue_head(glock_wait_table + i); return 0; } void gfs2_glock_exit(void) { shrinker_free(glock_shrinker); rhashtable_destroy(&gl_hash_table); } static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) { struct gfs2_glock *gl = gi->gl; if (gl) { if (n == 0) return; gfs2_glock_put_async(gl); } for (;;) { gl = rhashtable_walk_next(&gi->hti); if (IS_ERR_OR_NULL(gl)) { if (gl == ERR_PTR(-EAGAIN)) { n = 1; continue; } gl = NULL; break; } if (gl->gl_name.ln_sbd != gi->sdp) continue; if (n <= 1) { if (!lockref_get_not_dead(&gl->gl_lockref)) continue; break; } else { if (__lockref_is_dead(&gl->gl_lockref)) continue; n--; } } gi->gl = gl; } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; loff_t n; /* * We can either stay where we are, skip to the next hash table * entry, or start from the beginning. 
*/ if (*pos < gi->last_pos) { rhashtable_walk_exit(&gi->hti); rhashtable_walk_enter(&gl_hash_table, &gi->hti); n = *pos + 1; } else { n = *pos - gi->last_pos; } rhashtable_walk_start(&gi->hti); gfs2_glock_iter_next(gi, n); gi->last_pos = *pos; return gi->gl; } static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi, 1); return gi->gl; } static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) __releases(RCU) { struct gfs2_glock_iter *gi = seq->private; rhashtable_walk_stop(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { dump_glock(seq, iter_ptr, false); return 0; } static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) { preempt_disable(); if (*pos >= GFS2_NR_SBSTATS) return NULL; return pos; } static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { (*pos)++; if (*pos >= GFS2_NR_SBSTATS) return NULL; return pos; } static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) { preempt_enable(); } static const struct seq_operations gfs2_glock_seq_ops = { .start = gfs2_glock_seq_start, .next = gfs2_glock_seq_next, .stop = gfs2_glock_seq_stop, .show = gfs2_glock_seq_show, }; static const struct seq_operations gfs2_glstats_seq_ops = { .start = gfs2_glock_seq_start, .next = gfs2_glock_seq_next, .stop = gfs2_glock_seq_stop, .show = gfs2_glstats_seq_show, }; static const struct seq_operations gfs2_sbstats_sops = { .start = gfs2_sbstats_seq_start, .next = gfs2_sbstats_seq_next, .stop = gfs2_sbstats_seq_stop, .show = gfs2_sbstats_seq_show, }; #define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) static int __gfs2_glocks_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter)); if (ret == 0) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; /* * Initially, we are "before" the first hash table entry; the * first call to rhashtable_walk_next gets us the first entry. 
*/ gi->last_pos = -1; gi->gl = NULL; rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } static int gfs2_glocks_open(struct inode *inode, struct file *file) { return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops); } static int gfs2_glocks_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; if (gi->gl) gfs2_glock_put(gi->gl); rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } static int gfs2_glstats_open(struct inode *inode, struct file *file) { return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } static const struct file_operations gfs2_glocks_fops = { .owner = THIS_MODULE, .open = gfs2_glocks_open, .read = seq_read, .llseek = seq_lseek, .release = gfs2_glocks_release, }; static const struct file_operations gfs2_glstats_fops = { .owner = THIS_MODULE, .open = gfs2_glstats_open, .read = seq_read, .llseek = seq_lseek, .release = gfs2_glocks_release, }; struct gfs2_glockfd_iter { struct super_block *sb; unsigned int tgid; struct task_struct *task; unsigned int fd; struct file *file; }; static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i) { struct pid_namespace *ns = task_active_pid_ns(current); struct pid *pid; if (i->task) put_task_struct(i->task); rcu_read_lock(); retry: i->task = NULL; pid = find_ge_pid(i->tgid, ns); if (pid) { i->tgid = pid_nr_ns(pid, ns); i->task = pid_task(pid, PIDTYPE_TGID); if (!i->task) { i->tgid++; goto retry; } get_task_struct(i->task); } rcu_read_unlock(); return i->task; } static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) { if (i->file) { fput(i->file); i->file = NULL; } for(;; i->fd++) { i->file = fget_task_next(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } if (file_inode(i->file)->i_sb == i->sb) break; fput(i->file); } return i->file; } static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glockfd_iter *i = seq->private; if (*pos) return NULL; while (gfs2_glockfd_next_task(i)) { if (gfs2_glockfd_next_file(i)) return i; i->tgid++; } return NULL; } static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct gfs2_glockfd_iter *i = seq->private; (*pos)++; i->fd++; do { if (gfs2_glockfd_next_file(i)) return i; i->tgid++; } while (gfs2_glockfd_next_task(i)); return NULL; } static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glockfd_iter *i = seq->private; if (i->file) fput(i->file); if (i->task) put_task_struct(i->task); } static void gfs2_glockfd_seq_show_flock(struct seq_file *seq, struct gfs2_glockfd_iter *i) { struct gfs2_file *fp = i->file->private_data; struct gfs2_holder *fl_gh = &fp->f_fl_gh; struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED }; if (!READ_ONCE(fl_gh->gh_gl)) return; spin_lock(&i->file->f_lock); if (gfs2_holder_initialized(fl_gh)) gl_name = fl_gh->gh_gl->gl_name; spin_unlock(&i->file->f_lock); if (gl_name.ln_type != LM_TYPE_RESERVED) { seq_printf(seq, "%d %u %u/%llx\n", i->tgid, i->fd, gl_name.ln_type, (unsigned long long)gl_name.ln_number); } } static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_glockfd_iter *i = seq->private; struct inode *inode = file_inode(i->file); struct gfs2_glock *gl; inode_lock_shared(inode); gl = GFS2_I(inode)->i_iopen_gh.gh_gl; if (gl) { seq_printf(seq, "%d %u %u/%llx\n", i->tgid, i->fd, gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); } gfs2_glockfd_seq_show_flock(seq, 
i); inode_unlock_shared(inode); return 0; } static const struct seq_operations gfs2_glockfd_seq_ops = { .start = gfs2_glockfd_seq_start, .next = gfs2_glockfd_seq_next, .stop = gfs2_glockfd_seq_stop, .show = gfs2_glockfd_seq_show, }; static int gfs2_glockfd_open(struct inode *inode, struct file *file) { struct gfs2_glockfd_iter *i; struct gfs2_sbd *sdp = inode->i_private; i = __seq_open_private(file, &gfs2_glockfd_seq_ops, sizeof(struct gfs2_glockfd_iter)); if (!i) return -ENOMEM; i->sb = sdp->sd_vfs; return 0; } static const struct file_operations gfs2_glockfd_fops = { .owner = THIS_MODULE, .open = gfs2_glockfd_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) { sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glocks_fops); debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glockfd_fops); debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glstats_fops); debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_sbstats_fops); } void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) { debugfs_remove_recursive(sdp->debugfs_dir); sdp->debugfs_dir = NULL; } void gfs2_register_debugfs(void) { gfs2_root = debugfs_create_dir("gfs2", NULL); } void gfs2_unregister_debugfs(void) { debugfs_remove(gfs2_root); gfs2_root = NULL; } |
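The debugfs entries created above ("glocks", "glockfd", "glstats" and "sbstats" under the per-filesystem directory in /sys/kernel/debug/gfs2/) are plain seq_file text, so they can be read with ordinary file I/O. Below is a minimal userspace sketch, not part of the kernel sources; it assumes debugfs is mounted at /sys/kernel/debug and uses a made-up lock table name "mycluster:myfs".

#include <stdio.h>

int main(void)
{
	/*
	 * Path assumes debugfs mounted at /sys/kernel/debug; "mycluster:myfs"
	 * stands in for the lock table name used as the directory name by
	 * gfs2_create_debugfs_file() above.
	 */
	FILE *f = fopen("/sys/kernel/debug/gfs2/mycluster:myfs/glocks", "r");
	char line[512];

	if (!f) {
		perror("fopen");
		return 1;
	}
	/*
	 * One "G:" line per glock followed by indented "H:" holder lines,
	 * in the format documented above gfs2_dump_glock().
	 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}

The same read loop works for the "glstats" and "sbstats" files, whose record formats come from gfs2_glstats_seq_show() and gfs2_sbstats_seq_show() above.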
/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/export.h>

#include "rds.h"

DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);

/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */

static const char *const rds_stat_names[] = {
	"conn_reset",
	"recv_drop_bad_checksum",
	"recv_drop_old_seq",
	"recv_drop_no_sock",
	"recv_drop_dead_sock",
	"recv_deliver_raced",
	"recv_delivered",
	"recv_queued",
	"recv_immediate_retry",
	"recv_delayed_retry",
	"recv_ack_required",
	"recv_rdma_bytes",
	"recv_ping",
	"send_queue_empty",
	"send_queue_full",
	"send_lock_contention",
	"send_lock_queue_raced",
	"send_immediate_retry",
	"send_delayed_retry",
	"send_drop_acked",
	"send_ack_required",
	"send_queued",
	"send_rdma",
	"send_rdma_bytes",
	"send_pong",
	"page_remainder_hit",
	"page_remainder_miss",
	"copy_to_user",
	"copy_from_user",
	"cong_update_queued",
	"cong_update_received",
	"cong_send_error",
	"cong_send_blocked",
	"recv_bytes_added_to_sock",
	"recv_bytes_freed_fromsock",
	"send_stuck_rm",
};

void rds_stats_info_copy(struct rds_info_iterator *iter,
			 uint64_t *values, const char *const *names, size_t nr)
{
	struct rds_info_counter ctr;
	size_t i;

	for (i = 0; i < nr; i++) {
		BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
		strscpy_pad(ctr.name, names[i]);
		ctr.value = values[i];

		rds_info_copy(iter, &ctr, sizeof(ctr));
	}
}
EXPORT_SYMBOL_GPL(rds_stats_info_copy);

/*
 * This gives global counters across all the transports. The strings
 * are copied in so that the tool doesn't need knowledge of the specific
 * stats that we're exporting. Some are pretty implementation dependent
 * and may change over time. That doesn't stop them from being useful.
 *
 * This is the only function in the chain that knows about the byte granular
 * length in userspace. It converts it to number of stat entries that the
 * rest of the functions operate in.
 */
static void rds_stats_info(struct socket *sock, unsigned int len,
			   struct rds_info_iterator *iter,
			   struct rds_info_lengths *lens)
{
	struct rds_statistics stats = {0, };
	uint64_t *src;
	uint64_t *sum;
	size_t i;
	int cpu;
	unsigned int avail;

	avail = len / sizeof(struct rds_info_counter);

	if (avail < ARRAY_SIZE(rds_stat_names)) {
		avail = 0;
		goto trans;
	}

	for_each_online_cpu(cpu) {
		src = (uint64_t *)&(per_cpu(rds_stats, cpu));
		sum = (uint64_t *)&stats;
		for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
			*(sum++) += *(src++);
	}

	rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
			    ARRAY_SIZE(rds_stat_names));
	avail -= ARRAY_SIZE(rds_stat_names);

trans:
	lens->each = sizeof(struct rds_info_counter);
	lens->nr = rds_trans_stats_info_copy(iter, avail) +
		   ARRAY_SIZE(rds_stat_names);
}

void rds_stats_exit(void)
{
	rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
}

int rds_stats_init(void)
{
	rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
	return 0;
}
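For completeness, a hedged sketch of the consumer side: the counters filled in by rds_stats_info() are normally retrieved through the RDS_INFO_COUNTERS getsockopt on an RDS socket, which is how the rds-info(1) tool reads them. The struct rds_info_counter layout (a fixed-size NUL-padded name plus a 64-bit value) follows from the code above; the grow-on-ENOSPC sizing loop is an assumption modelled on that tool rather than something this file defines.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/socket.h>
#include <linux/rds.h>

#ifndef AF_RDS
#define AF_RDS 21	/* value from <linux/socket.h>; older libcs may not expose it */
#endif

int main(void)
{
	int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
	socklen_t len = 0;
	void *buf = NULL;

	if (fd < 0) {
		perror("socket");	/* kernel without RDS support? */
		return 1;
	}
	/*
	 * Assumed convention (as used by rds-info): getsockopt() fails with
	 * ENOSPC and reports the required length until the buffer is large
	 * enough, then fills it with struct rds_info_counter records.
	 */
	while (getsockopt(fd, SOL_RDS, RDS_INFO_COUNTERS, buf, &len) < 0) {
		if (errno != ENOSPC) {
			perror("getsockopt(RDS_INFO_COUNTERS)");
			return 1;
		}
		buf = realloc(buf, len);
		if (!buf)
			return 1;
	}

	struct rds_info_counter *ctr = buf;
	unsigned int i;

	/* ctr[i].name is NUL-padded by strscpy_pad() in rds_stats_info_copy(). */
	for (i = 0; i < len / sizeof(*ctr); i++)
		printf("%-32s %llu\n", (const char *)ctr[i].name,
		       (unsigned long long)ctr[i].value);
	free(buf);
	return 0;
}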
6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Security-Enhanced Linux (SELinux) security module
 *
 * This file contains the SELinux hook function implementations.
 *
 * Authors:  Stephen Smalley, <stephen.smalley.work@gmail.com>
 *           Chris Vance, <cvance@nai.com>
 *           Wayne Salamon, <wsalamon@nai.com>
 *           James Morris <jmorris@redhat.com>
 *
 * Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 * Copyright (C) 2003-2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *                                        Eric Paris <eparis@redhat.com>
 * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 *                         <dgoeddel@trustedcs.com>
 * Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P.
 *      Paul Moore <paul@paul-moore.com>
 * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
* Yuichi Nakamura <ynakam@hitachisoft.jp> * Copyright (C) 2016 Mellanox Technologies */ #include <linux/init.h> #include <linux/kd.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/lsm_hooks.h> #include <linux/xattr.h> #include <linux/capability.h> #include <linux/unistd.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/swap.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/dcache.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/tty.h> #include <net/icmp.h> #include <net/ip.h> /* for local_port_range[] */ #include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ #include <net/inet_connection_sock.h> #include <net/net_namespace.h> #include <net/netlabel.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/netdevice.h> /* for network interface checks */ #include <net/netlink.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/sctp.h> #include <net/sctp/structs.h> #include <linux/quota.h> #include <linux/un.h> /* for Unix socket types */ #include <net/af_unix.h> /* for Unix socket types */ #include <linux/parser.h> #include <linux/nfs_mount.h> #include <net/ipv6.h> #include <linux/hugetlb.h> #include <linux/personality.h> #include <linux/audit.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/posix-timers.h> #include <linux/syslog.h> #include <linux/user_namespace.h> #include <linux/export.h> #include <linux/msg.h> #include <linux/shm.h> #include <uapi/linux/shm.h> #include <linux/bpf.h> #include <linux/kernfs.h> #include <linux/stringhash.h> /* for hashlen_string() */ #include <uapi/linux/mount.h> #include <linux/fsnotify.h> #include <linux/fanotify.h> #include <linux/io_uring/cmd.h> #include <uapi/linux/lsm.h> #include <linux/memfd.h> #include "initcalls.h" #include "avc.h" #include "objsec.h" #include "netif.h" #include "netnode.h" #include "netport.h" #include "ibpkey.h" #include "xfrm.h" #include "netlabel.h" #include "audit.h" #include "avc_ss.h" #define SELINUX_INODE_INIT_XATTRS 1 struct selinux_state selinux_state; /* SECMARK reference count */ static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0); #ifdef CONFIG_SECURITY_SELINUX_DEVELOP static int selinux_enforcing_boot __initdata; static int __init enforcing_setup(char *str) { unsigned long enforcing; if (!kstrtoul(str, 0, &enforcing)) selinux_enforcing_boot = enforcing ? 1 : 0; return 1; } __setup("enforcing=", enforcing_setup); #else #define selinux_enforcing_boot 1 #endif int selinux_enabled_boot __initdata = 1; #ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM static int __init selinux_enabled_setup(char *str) { unsigned long enabled; if (!kstrtoul(str, 0, &enabled)) selinux_enabled_boot = enabled ? 1 : 0; return 1; } __setup("selinux=", selinux_enabled_setup); #endif static int __init checkreqprot_setup(char *str) { unsigned long checkreqprot; if (!kstrtoul(str, 0, &checkreqprot)) { if (checkreqprot) pr_err("SELinux: checkreqprot set to 1 via kernel parameter. 
This is no longer supported.\n"); } return 1; } __setup("checkreqprot=", checkreqprot_setup); /** * selinux_secmark_enabled - Check to see if SECMARK is currently enabled * * Description: * This function checks the SECMARK reference counter to see if any SECMARK * targets are currently configured, if the reference counter is greater than * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is * enabled, false (0) if SECMARK is disabled. If the always_check_network * policy capability is enabled, SECMARK is always considered enabled. * */ static int selinux_secmark_enabled(void) { return (selinux_policycap_alwaysnetwork() || atomic_read(&selinux_secmark_refcount)); } /** * selinux_peerlbl_enabled - Check to see if peer labeling is currently enabled * * Description: * This function checks if NetLabel or labeled IPSEC is enabled. Returns true * (1) if any are enabled or false (0) if neither are enabled. If the * always_check_network policy capability is enabled, peer labeling * is always considered enabled. * */ static int selinux_peerlbl_enabled(void) { return (selinux_policycap_alwaysnetwork() || netlbl_enabled() || selinux_xfrm_enabled()); } static int selinux_netcache_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) { sel_netif_flush(); sel_netnode_flush(); sel_netport_flush(); synchronize_net(); } return 0; } static int selinux_lsm_notifier_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) { sel_ib_pkey_flush(); call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL); } return 0; } /* * initialise the security for the init task */ static void cred_init_security(void) { struct cred_security_struct *crsec; /* NOTE: the lsm framework zeros out the buffer on allocation */ crsec = selinux_cred(unrcu_pointer(current->real_cred)); crsec->osid = crsec->sid = SECINITSID_KERNEL; } /* * get the security ID of a set of credentials */ static inline u32 cred_sid(const struct cred *cred) { const struct cred_security_struct *crsec; crsec = selinux_cred(cred); return crsec->sid; } static void __ad_net_init(struct common_audit_data *ad, struct lsm_network_audit *net, int ifindex, struct sock *sk, u16 family) { ad->type = LSM_AUDIT_DATA_NET; ad->u.net = net; net->netif = ifindex; net->sk = sk; net->family = family; } static void ad_net_init_from_sk(struct common_audit_data *ad, struct lsm_network_audit *net, struct sock *sk) { __ad_net_init(ad, net, 0, sk, 0); } static void ad_net_init_from_iif(struct common_audit_data *ad, struct lsm_network_audit *net, int ifindex, u16 family) { __ad_net_init(ad, net, ifindex, NULL, family); } /* * get the objective security ID of a task */ static inline u32 task_sid_obj(const struct task_struct *task) { u32 sid; rcu_read_lock(); sid = cred_sid(__task_cred(task)); rcu_read_unlock(); return sid; } static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry); /* * Try reloading inode security labels that have been marked as invalid. The * @may_sleep parameter indicates when sleeping and thus reloading labels is * allowed; when set to false, returns -ECHILD when the label is * invalid. The @dentry parameter should be set to a dentry of the inode. */ static int __inode_security_revalidate(struct inode *inode, struct dentry *dentry, bool may_sleep) { if (!selinux_initialized()) return 0; if (may_sleep) might_sleep(); else return -ECHILD; /* * Check to ensure that an inode's SELinux state is valid and try * reloading the inode security label if necessary. 
This will fail if * @dentry is NULL and no dentry for this inode can be found; in that * case, continue using the old label. */ inode_doinit_with_dentry(inode, dentry); return 0; } static struct inode_security_struct *inode_security_novalidate(struct inode *inode) { return selinux_inode(inode); } static inline struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu) { int rc; struct inode_security_struct *isec = selinux_inode(inode); /* check below is racy, but revalidate will recheck with lock held */ if (data_race(likely(isec->initialized == LABEL_INITIALIZED))) return isec; rc = __inode_security_revalidate(inode, NULL, !rcu); if (rc) return ERR_PTR(rc); return isec; } /* * Get the security label of an inode. */ static inline struct inode_security_struct *inode_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); /* check below is racy, but revalidate will recheck with lock held */ if (data_race(likely(isec->initialized == LABEL_INITIALIZED))) return isec; __inode_security_revalidate(inode, NULL, true); return isec; } static inline struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry) { return selinux_inode(d_backing_inode(dentry)); } /* * Get the security label of a dentry's backing inode. */ static inline struct inode_security_struct *backing_inode_security(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec = selinux_inode(inode); /* check below is racy, but revalidate will recheck with lock held */ if (data_race(likely(isec->initialized == LABEL_INITIALIZED))) return isec; __inode_security_revalidate(inode, dentry, true); return isec; } static void inode_free_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); struct superblock_security_struct *sbsec; if (!isec) return; sbsec = selinux_superblock(inode->i_sb); /* * As not all inode security structures are in a list, we check for * empty list outside of the lock to make sure that we won't waste * time taking a lock doing nothing. * * The list_del_init() function can be safely called more than once. * It should not be possible for this function to be called with * concurrent list_add(), but for better safety against future changes * in the code, we use list_empty_careful() here. 
*/ if (!list_empty_careful(&isec->list)) { spin_lock(&sbsec->isec_lock); list_del_init(&isec->list); spin_unlock(&sbsec->isec_lock); } } struct selinux_mnt_opts { u32 fscontext_sid; u32 context_sid; u32 rootcontext_sid; u32 defcontext_sid; }; static void selinux_free_mnt_opts(void *mnt_opts) { kfree(mnt_opts); } enum { Opt_error = -1, Opt_context = 0, Opt_defcontext = 1, Opt_fscontext = 2, Opt_rootcontext = 3, Opt_seclabel = 4, }; #define A(s, has_arg) {#s, sizeof(#s) - 1, Opt_##s, has_arg} static const struct { const char *name; int len; int opt; bool has_arg; } tokens[] = { A(context, true), A(fscontext, true), A(defcontext, true), A(rootcontext, true), A(seclabel, false), }; #undef A static int match_opt_prefix(char *s, int l, char **arg) { unsigned int i; for (i = 0; i < ARRAY_SIZE(tokens); i++) { size_t len = tokens[i].len; if (len > l || memcmp(s, tokens[i].name, len)) continue; if (tokens[i].has_arg) { if (len == l || s[len] != '=') continue; *arg = s + len + 1; } else if (len != l) continue; return tokens[i].opt; } return Opt_error; } #define SEL_MOUNT_FAIL_MSG "SELinux: duplicate or incompatible mount options\n" static int may_context_mount_sb_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { const struct cred_security_struct *crsec = selinux_cred(cred); int rc; rc = avc_has_perm(crsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; rc = avc_has_perm(crsec->sid, sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELTO, NULL); return rc; } static int may_context_mount_inode_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { const struct cred_security_struct *crsec = selinux_cred(cred); int rc; rc = avc_has_perm(crsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; rc = avc_has_perm(sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, NULL); return rc; } static int selinux_is_genfs_special_handling(struct super_block *sb) { /* Special handling. Genfs but also in-core setxattr handler */ return !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "rootfs") || (selinux_policycap_cgroupseclabel() && (!strcmp(sb->s_type->name, "cgroup") || !strcmp(sb->s_type->name, "cgroup2"))) || (selinux_policycap_functionfs_seclabel() && !strcmp(sb->s_type->name, "functionfs")); } static int selinux_is_sblabel_mnt(struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); /* * IMPORTANT: Double-check logic in this function when adding a new * SECURITY_FS_USE_* definition! */ BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7); switch (sbsec->behavior) { case SECURITY_FS_USE_XATTR: case SECURITY_FS_USE_TRANS: case SECURITY_FS_USE_TASK: case SECURITY_FS_USE_NATIVE: return 1; case SECURITY_FS_USE_GENFS: return selinux_is_genfs_special_handling(sb); /* Never allow relabeling on context mounts */ case SECURITY_FS_USE_MNTPOINT: case SECURITY_FS_USE_NONE: default: return 0; } } static int sb_check_xattr_support(struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); struct dentry *root = sb->s_root; struct inode *root_inode = d_backing_inode(root); u32 sid; int rc; /* * Make sure that the xattr handler exists and that no * error other than -ENODATA is returned by getxattr on * the root directory. 
-ENODATA is ok, as this may be * the first boot of the SELinux kernel before we have * assigned xattr values to the filesystem. */ if (!(root_inode->i_opflags & IOP_XATTR)) { pr_warn("SELinux: (dev %s, type %s) has no xattr support\n", sb->s_id, sb->s_type->name); goto fallback; } rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0); if (rc < 0 && rc != -ENODATA) { if (rc == -EOPNOTSUPP) { pr_warn("SELinux: (dev %s, type %s) has no security xattr handler\n", sb->s_id, sb->s_type->name); goto fallback; } else { pr_warn("SELinux: (dev %s, type %s) getxattr errno %d\n", sb->s_id, sb->s_type->name, -rc); return rc; } } return 0; fallback: /* No xattr support - try to fallback to genfs if possible. */ rc = security_genfs_sid(sb->s_type->name, "/", SECCLASS_DIR, &sid); if (rc) return -EOPNOTSUPP; pr_warn("SELinux: (dev %s, type %s) falling back to genfs\n", sb->s_id, sb->s_type->name); sbsec->behavior = SECURITY_FS_USE_GENFS; sbsec->sid = sid; return 0; } static int sb_finish_set_opts(struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); struct dentry *root = sb->s_root; struct inode *root_inode = d_backing_inode(root); int rc = 0; if (sbsec->behavior == SECURITY_FS_USE_XATTR) { rc = sb_check_xattr_support(sb); if (rc) return rc; } sbsec->flags |= SE_SBINITIALIZED; /* * Explicitly set or clear SBLABEL_MNT. It's not sufficient to simply * leave the flag untouched because sb_clone_mnt_opts might be handing * us a superblock that needs the flag to be cleared. */ if (selinux_is_sblabel_mnt(sb)) sbsec->flags |= SBLABEL_MNT; else sbsec->flags &= ~SBLABEL_MNT; /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); /* Initialize any other inodes associated with the superblock, e.g. inodes created prior to initial policy load or inodes created during get_sb by a pseudo filesystem that directly populates itself. */ spin_lock(&sbsec->isec_lock); while (!list_empty(&sbsec->isec_head)) { struct inode_security_struct *isec = list_first_entry(&sbsec->isec_head, struct inode_security_struct, list); struct inode *inode = isec->inode; list_del_init(&isec->list); spin_unlock(&sbsec->isec_lock); inode = igrab(inode); if (inode) { if (!IS_PRIVATE(inode)) inode_doinit_with_dentry(inode, NULL); iput(inode); } spin_lock(&sbsec->isec_lock); } spin_unlock(&sbsec->isec_lock); return rc; } static int bad_option(struct superblock_security_struct *sbsec, char flag, u32 old_sid, u32 new_sid) { char mnt_flags = sbsec->flags & SE_MNTMASK; /* check if the old mount command had the same options */ if (sbsec->flags & SE_SBINITIALIZED) if (!(sbsec->flags & flag) || (old_sid != new_sid)) return 1; /* check if we were passed the same options twice, * aka someone passed context=a,context=b */ if (!(sbsec->flags & SE_SBINITIALIZED)) if (mnt_flags & flag) return 1; return 0; } /* * Allow filesystems with binary mount data to explicitly set mount point * labeling information. 
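 *
 * The function below parses any fscontext=/context=/rootcontext=/defcontext=
 * SIDs supplied with the mount, rejects options that conflict with an
 * already-initialized superblock, and then applies the resulting labeling
 * behavior. For illustration only (the exact context string is
 * policy-dependent and not part of the original file), such a mount might
 * be requested from userspace as:
 *
 *   mount -t tmpfs -o context=system_u:object_r:tmp_t:s0 tmpfs /mnt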
*/ static int selinux_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { const struct cred *cred = current_cred(); struct superblock_security_struct *sbsec = selinux_superblock(sb); struct dentry *root = sb->s_root; struct selinux_mnt_opts *opts = mnt_opts; struct inode_security_struct *root_isec; u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0; u32 defcontext_sid = 0; int rc = 0; /* * Specifying internal flags without providing a place to * place the results is not allowed */ if (kern_flags && !set_kern_flags) return -EINVAL; mutex_lock(&sbsec->lock); if (!selinux_initialized()) { if (!opts) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security server is ready to handle calls. */ if (kern_flags & SECURITY_LSM_NATIVE_LABELS) { sbsec->flags |= SE_SBNATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } goto out; } rc = -EINVAL; pr_warn("SELinux: Unable to set superblock options " "before the security server is initialized\n"); goto out; } /* * Binary mount data FS will come through this function twice. Once * from an explicit call and once from the generic calls from the vfs. * Since the generic VFS calls will not contain any security mount data * we need to skip the double mount verification. * * This does open a hole in which we will not notice if the first * mount using this sb set explicit options and a second mount using * this sb does not set any security options. (The first options * will be used for both mounts) */ if ((sbsec->flags & SE_SBINITIALIZED) && (sb->s_type->fs_flags & FS_BINARY_MOUNTDATA) && !opts) goto out; root_isec = backing_inode_security_novalidate(root); /* * parse the mount options, check if they are valid sids. * also check if someone is trying to mount the same sb more * than once with different security options. */ if (opts) { if (opts->fscontext_sid) { fscontext_sid = opts->fscontext_sid; if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, fscontext_sid)) goto out_double_mount; sbsec->flags |= FSCONTEXT_MNT; } if (opts->context_sid) { context_sid = opts->context_sid; if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, context_sid)) goto out_double_mount; sbsec->flags |= CONTEXT_MNT; } if (opts->rootcontext_sid) { rootcontext_sid = opts->rootcontext_sid; if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, rootcontext_sid)) goto out_double_mount; sbsec->flags |= ROOTCONTEXT_MNT; } if (opts->defcontext_sid) { defcontext_sid = opts->defcontext_sid; if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, defcontext_sid)) goto out_double_mount; sbsec->flags |= DEFCONTEXT_MNT; } } if (sbsec->flags & SE_SBINITIALIZED) { /* previously mounted with options, but not on this attempt? 
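 * If so, the two mounts disagree on security settings and the request is
 * rejected below as an invalid double mount of the same superblock.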
*/ if ((sbsec->flags & SE_MNTMASK) && !opts) goto out_double_mount; rc = 0; goto out; } if (strcmp(sb->s_type->name, "proc") == 0) sbsec->flags |= SE_SBPROC | SE_SBGENFS; if (!strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "binder") || !strcmp(sb->s_type->name, "bpf") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "securityfs") || (selinux_policycap_functionfs_seclabel() && !strcmp(sb->s_type->name, "functionfs"))) sbsec->flags |= SE_SBGENFS; if (!strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "cgroup") || !strcmp(sb->s_type->name, "cgroup2")) sbsec->flags |= SE_SBGENFS | SE_SBGENFS_XATTR; if (!sbsec->behavior) { /* * Determine the labeling behavior to use for this * filesystem type. */ rc = security_fs_use(sb); if (rc) { pr_warn("%s: security_fs_use(%s) returned %d\n", __func__, sb->s_type->name, rc); goto out; } } /* * If this is a user namespace mount and the filesystem type is not * explicitly whitelisted, then no contexts are allowed on the command * line and security labels must be ignored. */ if (sb->s_user_ns != &init_user_ns && strcmp(sb->s_type->name, "tmpfs") && strcmp(sb->s_type->name, "ramfs") && strcmp(sb->s_type->name, "devpts") && strcmp(sb->s_type->name, "overlay")) { if (context_sid || fscontext_sid || rootcontext_sid || defcontext_sid) { rc = -EACCES; goto out; } if (sbsec->behavior == SECURITY_FS_USE_XATTR) { sbsec->behavior = SECURITY_FS_USE_MNTPOINT; rc = security_transition_sid(current_sid(), current_sid(), SECCLASS_FILE, NULL, &sbsec->mntpoint_sid); if (rc) goto out; } goto out_set_opts; } /* sets the context of the superblock for the fs being mounted. */ if (fscontext_sid) { rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred); if (rc) goto out; sbsec->sid = fscontext_sid; } /* * Switch to using mount point labeling behavior. * sets the label used on all file below the mountpoint, and will set * the superblock context if not already set. */ if (sbsec->flags & SE_SBNATIVE) { /* * This means we are initializing a superblock that has been * mounted before the SELinux was initialized and the * filesystem requested native labeling. We had already * returned SECURITY_LSM_NATIVE_LABELS in *set_kern_flags * in the original mount attempt, so now we just need to set * the SECURITY_FS_USE_NATIVE behavior. 
*/ sbsec->behavior = SECURITY_FS_USE_NATIVE; } else if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) { sbsec->behavior = SECURITY_FS_USE_NATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } if (context_sid) { if (!fscontext_sid) { rc = may_context_mount_sb_relabel(context_sid, sbsec, cred); if (rc) goto out; sbsec->sid = context_sid; } else { rc = may_context_mount_inode_relabel(context_sid, sbsec, cred); if (rc) goto out; } if (!rootcontext_sid) rootcontext_sid = context_sid; sbsec->mntpoint_sid = context_sid; sbsec->behavior = SECURITY_FS_USE_MNTPOINT; } if (rootcontext_sid) { rc = may_context_mount_inode_relabel(rootcontext_sid, sbsec, cred); if (rc) goto out; root_isec->sid = rootcontext_sid; root_isec->initialized = LABEL_INITIALIZED; } if (defcontext_sid) { if (sbsec->behavior != SECURITY_FS_USE_XATTR && sbsec->behavior != SECURITY_FS_USE_NATIVE) { rc = -EINVAL; pr_warn("SELinux: defcontext option is " "invalid for this filesystem type\n"); goto out; } if (defcontext_sid != sbsec->def_sid) { rc = may_context_mount_inode_relabel(defcontext_sid, sbsec, cred); if (rc) goto out; } sbsec->def_sid = defcontext_sid; } out_set_opts: rc = sb_finish_set_opts(sb); out: mutex_unlock(&sbsec->lock); return rc; out_double_mount: rc = -EINVAL; pr_warn("SELinux: mount invalid. Same superblock, different " "security settings for (dev %s, type %s)\n", sb->s_id, sb->s_type->name); goto out; } static int selinux_cmp_sb_context(const struct super_block *oldsb, const struct super_block *newsb) { struct superblock_security_struct *old = selinux_superblock(oldsb); struct superblock_security_struct *new = selinux_superblock(newsb); char oldflags = old->flags & SE_MNTMASK; char newflags = new->flags & SE_MNTMASK; if (oldflags != newflags) goto mismatch; if ((oldflags & FSCONTEXT_MNT) && old->sid != new->sid) goto mismatch; if ((oldflags & CONTEXT_MNT) && old->mntpoint_sid != new->mntpoint_sid) goto mismatch; if ((oldflags & DEFCONTEXT_MNT) && old->def_sid != new->def_sid) goto mismatch; if (oldflags & ROOTCONTEXT_MNT) { struct inode_security_struct *oldroot = backing_inode_security(oldsb->s_root); struct inode_security_struct *newroot = backing_inode_security(newsb->s_root); if (oldroot->sid != newroot->sid) goto mismatch; } return 0; mismatch: pr_warn("SELinux: mount invalid. Same superblock, " "different security settings for (dev %s, " "type %s)\n", newsb->s_id, newsb->s_type->name); return -EBUSY; } static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { int rc = 0; const struct superblock_security_struct *oldsbsec = selinux_superblock(oldsb); struct superblock_security_struct *newsbsec = selinux_superblock(newsb); int set_fscontext = (oldsbsec->flags & FSCONTEXT_MNT); int set_context = (oldsbsec->flags & CONTEXT_MNT); int set_rootcontext = (oldsbsec->flags & ROOTCONTEXT_MNT); /* * Specifying internal flags without providing a place to * place the results is not allowed. */ if (kern_flags && !set_kern_flags) return -EINVAL; mutex_lock(&newsbsec->lock); /* * if the parent was able to be mounted it clearly had no special lsm * mount options. thus we can safely deal with this superblock later */ if (!selinux_initialized()) { if (kern_flags & SECURITY_LSM_NATIVE_LABELS) { newsbsec->flags |= SE_SBNATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } goto out; } /* how can we clone if the old one wasn't set up?? 
*/ BUG_ON(!(oldsbsec->flags & SE_SBINITIALIZED)); /* if fs is reusing a sb, make sure that the contexts match */ if (newsbsec->flags & SE_SBINITIALIZED) { mutex_unlock(&newsbsec->lock); if ((kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; return selinux_cmp_sb_context(oldsb, newsb); } newsbsec->flags = oldsbsec->flags; newsbsec->sid = oldsbsec->sid; newsbsec->def_sid = oldsbsec->def_sid; newsbsec->behavior = oldsbsec->behavior; if (newsbsec->behavior == SECURITY_FS_USE_NATIVE && !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) { rc = security_fs_use(newsb); if (rc) goto out; } if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !set_context) { newsbsec->behavior = SECURITY_FS_USE_NATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } if (set_context) { u32 sid = oldsbsec->mntpoint_sid; if (!set_fscontext) newsbsec->sid = sid; if (!set_rootcontext) { struct inode_security_struct *newisec = backing_inode_security(newsb->s_root); newisec->sid = sid; } newsbsec->mntpoint_sid = sid; } if (set_rootcontext) { const struct inode_security_struct *oldisec = backing_inode_security(oldsb->s_root); struct inode_security_struct *newisec = backing_inode_security(newsb->s_root); newisec->sid = oldisec->sid; } sb_finish_set_opts(newsb); out: mutex_unlock(&newsbsec->lock); return rc; } /* * NOTE: the caller is responsible for freeing the memory even if on error. */ static int selinux_add_opt(int token, const char *s, void **mnt_opts) { struct selinux_mnt_opts *opts = *mnt_opts; u32 *dst_sid; int rc; if (token == Opt_seclabel) /* eaten and completely ignored */ return 0; if (!s) return -EINVAL; if (!selinux_initialized()) { pr_warn("SELinux: Unable to set superblock options before the security server is initialized\n"); return -EINVAL; } if (!opts) { opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; *mnt_opts = opts; } switch (token) { case Opt_context: if (opts->context_sid || opts->defcontext_sid) goto err; dst_sid = &opts->context_sid; break; case Opt_fscontext: if (opts->fscontext_sid) goto err; dst_sid = &opts->fscontext_sid; break; case Opt_rootcontext: if (opts->rootcontext_sid) goto err; dst_sid = &opts->rootcontext_sid; break; case Opt_defcontext: if (opts->context_sid || opts->defcontext_sid) goto err; dst_sid = &opts->defcontext_sid; break; default: WARN_ON(1); return -EINVAL; } rc = security_context_str_to_sid(s, dst_sid, GFP_KERNEL); if (rc) pr_warn("SELinux: security_context_str_to_sid (%s) failed with errno=%d\n", s, rc); return rc; err: pr_warn(SEL_MOUNT_FAIL_MSG); return -EINVAL; } static int show_sid(struct seq_file *m, u32 sid) { char *context = NULL; u32 len; int rc; rc = security_sid_to_context(sid, &context, &len); if (!rc) { bool has_comma = strchr(context, ','); seq_putc(m, '='); if (has_comma) seq_putc(m, '\"'); seq_escape(m, context, "\"\n\\"); if (has_comma) seq_putc(m, '\"'); } kfree(context); return rc; } static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); int rc; if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; if (!selinux_initialized()) return 0; if (sbsec->flags & FSCONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, FSCONTEXT_STR); rc = show_sid(m, sbsec->sid); if (rc) return rc; } if (sbsec->flags & CONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, CONTEXT_STR); rc = show_sid(m, sbsec->mntpoint_sid); if (rc) return rc; } if (sbsec->flags & DEFCONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, DEFCONTEXT_STR); rc 
= show_sid(m, sbsec->def_sid); if (rc) return rc; } if (sbsec->flags & ROOTCONTEXT_MNT) { struct dentry *root = sb->s_root; struct inode_security_struct *isec = backing_inode_security(root); seq_putc(m, ','); seq_puts(m, ROOTCONTEXT_STR); rc = show_sid(m, isec->sid); if (rc) return rc; } if (sbsec->flags & SBLABEL_MNT) { seq_putc(m, ','); seq_puts(m, SECLABEL_STR); } return 0; } static inline u16 inode_mode_to_security_class(umode_t mode) { switch (mode & S_IFMT) { case S_IFSOCK: return SECCLASS_SOCK_FILE; case S_IFLNK: return SECCLASS_LNK_FILE; case S_IFREG: return SECCLASS_FILE; case S_IFBLK: return SECCLASS_BLK_FILE; case S_IFDIR: return SECCLASS_DIR; case S_IFCHR: return SECCLASS_CHR_FILE; case S_IFIFO: return SECCLASS_FIFO_FILE; } return SECCLASS_FILE; } static inline int default_protocol_stream(int protocol) { return (protocol == IPPROTO_IP || protocol == IPPROTO_TCP || protocol == IPPROTO_MPTCP); } static inline int default_protocol_dgram(int protocol) { return (protocol == IPPROTO_IP || protocol == IPPROTO_UDP); } static inline u16 socket_type_to_security_class(int family, int type, int protocol) { bool extsockclass = selinux_policycap_extsockclass(); switch (family) { case PF_UNIX: switch (type) { case SOCK_STREAM: case SOCK_SEQPACKET: return SECCLASS_UNIX_STREAM_SOCKET; case SOCK_DGRAM: case SOCK_RAW: return SECCLASS_UNIX_DGRAM_SOCKET; } break; case PF_INET: case PF_INET6: switch (type) { case SOCK_STREAM: case SOCK_SEQPACKET: if (default_protocol_stream(protocol)) return SECCLASS_TCP_SOCKET; else if (extsockclass && protocol == IPPROTO_SCTP) return SECCLASS_SCTP_SOCKET; else return SECCLASS_RAWIP_SOCKET; case SOCK_DGRAM: if (default_protocol_dgram(protocol)) return SECCLASS_UDP_SOCKET; else if (extsockclass && (protocol == IPPROTO_ICMP || protocol == IPPROTO_ICMPV6)) return SECCLASS_ICMP_SOCKET; else return SECCLASS_RAWIP_SOCKET; default: return SECCLASS_RAWIP_SOCKET; } break; case PF_NETLINK: switch (protocol) { case NETLINK_ROUTE: return SECCLASS_NETLINK_ROUTE_SOCKET; case NETLINK_SOCK_DIAG: return SECCLASS_NETLINK_TCPDIAG_SOCKET; case NETLINK_NFLOG: return SECCLASS_NETLINK_NFLOG_SOCKET; case NETLINK_XFRM: return SECCLASS_NETLINK_XFRM_SOCKET; case NETLINK_SELINUX: return SECCLASS_NETLINK_SELINUX_SOCKET; case NETLINK_ISCSI: return SECCLASS_NETLINK_ISCSI_SOCKET; case NETLINK_AUDIT: return SECCLASS_NETLINK_AUDIT_SOCKET; case NETLINK_FIB_LOOKUP: return SECCLASS_NETLINK_FIB_LOOKUP_SOCKET; case NETLINK_CONNECTOR: return SECCLASS_NETLINK_CONNECTOR_SOCKET; case NETLINK_NETFILTER: return SECCLASS_NETLINK_NETFILTER_SOCKET; case NETLINK_DNRTMSG: return SECCLASS_NETLINK_DNRT_SOCKET; case NETLINK_KOBJECT_UEVENT: return SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET; case NETLINK_GENERIC: return SECCLASS_NETLINK_GENERIC_SOCKET; case NETLINK_SCSITRANSPORT: return SECCLASS_NETLINK_SCSITRANSPORT_SOCKET; case NETLINK_RDMA: return SECCLASS_NETLINK_RDMA_SOCKET; case NETLINK_CRYPTO: return SECCLASS_NETLINK_CRYPTO_SOCKET; default: return SECCLASS_NETLINK_SOCKET; } case PF_PACKET: return SECCLASS_PACKET_SOCKET; case PF_KEY: return SECCLASS_KEY_SOCKET; case PF_APPLETALK: return SECCLASS_APPLETALK_SOCKET; } if (extsockclass) { switch (family) { case PF_AX25: return SECCLASS_AX25_SOCKET; case PF_IPX: return SECCLASS_IPX_SOCKET; case PF_NETROM: return SECCLASS_NETROM_SOCKET; case PF_ATMPVC: return SECCLASS_ATMPVC_SOCKET; case PF_X25: return SECCLASS_X25_SOCKET; case PF_ROSE: return SECCLASS_ROSE_SOCKET; case PF_DECnet: return SECCLASS_DECNET_SOCKET; case PF_ATMSVC: return SECCLASS_ATMSVC_SOCKET; case PF_RDS: 
return SECCLASS_RDS_SOCKET; case PF_IRDA: return SECCLASS_IRDA_SOCKET; case PF_PPPOX: return SECCLASS_PPPOX_SOCKET; case PF_LLC: return SECCLASS_LLC_SOCKET; case PF_CAN: return SECCLASS_CAN_SOCKET; case PF_TIPC: return SECCLASS_TIPC_SOCKET; case PF_BLUETOOTH: return SECCLASS_BLUETOOTH_SOCKET; case PF_IUCV: return SECCLASS_IUCV_SOCKET; case PF_RXRPC: return SECCLASS_RXRPC_SOCKET; case PF_ISDN: return SECCLASS_ISDN_SOCKET; case PF_PHONET: return SECCLASS_PHONET_SOCKET; case PF_IEEE802154: return SECCLASS_IEEE802154_SOCKET; case PF_CAIF: return SECCLASS_CAIF_SOCKET; case PF_ALG: return SECCLASS_ALG_SOCKET; case PF_NFC: return SECCLASS_NFC_SOCKET; case PF_VSOCK: return SECCLASS_VSOCK_SOCKET; case PF_KCM: return SECCLASS_KCM_SOCKET; case PF_QIPCRTR: return SECCLASS_QIPCRTR_SOCKET; case PF_SMC: return SECCLASS_SMC_SOCKET; case PF_XDP: return SECCLASS_XDP_SOCKET; case PF_MCTP: return SECCLASS_MCTP_SOCKET; #if PF_MAX > 46 #error New address family defined, please update this function. #endif } } return SECCLASS_SOCKET; } static int selinux_genfs_get_sid(struct dentry *dentry, u16 tclass, u16 flags, u32 *sid) { int rc; struct super_block *sb = dentry->d_sb; char *buffer, *path; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) return -ENOMEM; path = dentry_path_raw(dentry, buffer, PAGE_SIZE); if (IS_ERR(path)) rc = PTR_ERR(path); else { if (flags & SE_SBPROC) { /* each process gets a /proc/PID/ entry. Strip off the * PID part to get a valid selinux labeling. * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs */ while (path[1] >= '0' && path[1] <= '9') { path[1] = '/'; path++; } } rc = security_genfs_sid(sb->s_type->name, path, tclass, sid); if (rc == -ENOENT) { /* No match in policy, mark as unlabeled. */ *sid = SECINITSID_UNLABELED; rc = 0; } } free_page((unsigned long)buffer); return rc; } static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, u32 def_sid, u32 *sid) { #define INITCONTEXTLEN 255 char *context; unsigned int len; int rc; len = INITCONTEXTLEN; context = kmalloc(len + 1, GFP_NOFS); if (!context) return -ENOMEM; context[len] = '\0'; rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len); if (rc == -ERANGE) { kfree(context); /* Need a larger buffer. Query for the right size. */ rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, NULL, 0); if (rc < 0) return rc; len = rc; context = kmalloc(len + 1, GFP_NOFS); if (!context) return -ENOMEM; context[len] = '\0'; rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len); } if (rc < 0) { kfree(context); if (rc != -ENODATA) { pr_warn("SELinux: %s: getxattr returned %d for dev=%s ino=%ld\n", __func__, -rc, inode->i_sb->s_id, inode->i_ino); return rc; } *sid = def_sid; return 0; } rc = security_context_to_sid_default(context, rc, sid, def_sid, GFP_NOFS); if (rc) { char *dev = inode->i_sb->s_id; unsigned long ino = inode->i_ino; if (rc == -EINVAL) { pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s. This indicates you may need to relabel the inode or the filesystem in question.\n", ino, dev, context); } else { pr_warn("SELinux: %s: context_to_sid(%s) returned %d for dev=%s ino=%ld\n", __func__, context, -rc, dev, ino); } } kfree(context); return 0; } /* The inode's security attributes must be initialized before first use. 
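 *
 * The function below derives the inode SID according to the superblock's
 * labeling behavior (native/xattr, task, transition, mountpoint or genfs)
 * and caches it in the inode_security_struct; if the superblock has not
 * been initialized yet, the inode is queued for a later pass instead.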
*/ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry) { struct superblock_security_struct *sbsec = NULL; struct inode_security_struct *isec = selinux_inode(inode); u32 task_sid, sid = 0; u16 sclass; struct dentry *dentry; int rc = 0; if (isec->initialized == LABEL_INITIALIZED) return 0; spin_lock(&isec->lock); if (isec->initialized == LABEL_INITIALIZED) goto out_unlock; if (isec->sclass == SECCLASS_FILE) isec->sclass = inode_mode_to_security_class(inode->i_mode); sbsec = selinux_superblock(inode->i_sb); if (!(sbsec->flags & SE_SBINITIALIZED)) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security server is ready to handle calls. */ spin_lock(&sbsec->isec_lock); if (list_empty(&isec->list)) list_add(&isec->list, &sbsec->isec_head); spin_unlock(&sbsec->isec_lock); goto out_unlock; } sclass = isec->sclass; task_sid = isec->task_sid; sid = isec->sid; isec->initialized = LABEL_PENDING; spin_unlock(&isec->lock); switch (sbsec->behavior) { /* * In case of SECURITY_FS_USE_NATIVE we need to re-fetch the labels * via xattr when called from delayed_superblock_init(). */ case SECURITY_FS_USE_NATIVE: case SECURITY_FS_USE_XATTR: if (!(inode->i_opflags & IOP_XATTR)) { sid = sbsec->def_sid; break; } /* Need a dentry, since the xattr API requires one. Life would be simpler if we could just pass the inode. */ if (opt_dentry) { /* Called from d_instantiate or d_splice_alias. */ dentry = dget(opt_dentry); } else { /* * Called from selinux_complete_init, try to find a dentry. * Some filesystems really want a connected one, so try * that first. We could split SECURITY_FS_USE_XATTR in * two, depending upon that... */ dentry = d_find_alias(inode); if (!dentry) dentry = d_find_any_alias(inode); } if (!dentry) { /* * this is can be hit on boot when a file is accessed * before the policy is loaded. When we load policy we * may find inodes that have no dentry on the * sbsec->isec_head list. No reason to complain as these * will get fixed up the next time we go through * inode_doinit with a dentry, before these inodes could * be used again by userspace. */ goto out_invalid; } rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid, &sid); dput(dentry); if (rc) goto out; break; case SECURITY_FS_USE_TASK: sid = task_sid; break; case SECURITY_FS_USE_TRANS: /* Default to the fs SID. */ sid = sbsec->sid; /* Try to obtain a transition SID. */ rc = security_transition_sid(task_sid, sid, sclass, NULL, &sid); if (rc) goto out; break; case SECURITY_FS_USE_MNTPOINT: sid = sbsec->mntpoint_sid; break; default: /* Default to the fs superblock SID. */ sid = sbsec->sid; if ((sbsec->flags & SE_SBGENFS) && (!S_ISLNK(inode->i_mode) || selinux_policycap_genfs_seclabel_symlinks())) { /* We must have a dentry to determine the label on * procfs inodes */ if (opt_dentry) { /* Called from d_instantiate or * d_splice_alias. */ dentry = dget(opt_dentry); } else { /* Called from selinux_complete_init, try to * find a dentry. Some filesystems really want * a connected one, so try that first. */ dentry = d_find_alias(inode); if (!dentry) dentry = d_find_any_alias(inode); } /* * This can be hit on boot when a file is accessed * before the policy is loaded. When we load policy we * may find inodes that have no dentry on the * sbsec->isec_head list. No reason to complain as * these will get fixed up the next time we go through * inode_doinit() with a dentry, before these inodes * could be used again by userspace. 
*/ if (!dentry) goto out_invalid; rc = selinux_genfs_get_sid(dentry, sclass, sbsec->flags, &sid); if (rc) { dput(dentry); goto out; } if ((sbsec->flags & SE_SBGENFS_XATTR) && (inode->i_opflags & IOP_XATTR)) { rc = inode_doinit_use_xattr(inode, dentry, sid, &sid); if (rc) { dput(dentry); goto out; } } dput(dentry); } break; } out: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { if (rc) { isec->initialized = LABEL_INVALID; goto out_unlock; } isec->initialized = LABEL_INITIALIZED; isec->sid = sid; } out_unlock: spin_unlock(&isec->lock); return rc; out_invalid: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { isec->initialized = LABEL_INVALID; isec->sid = sid; } spin_unlock(&isec->lock); return 0; } /* Convert a Linux signal to an access vector. */ static inline u32 signal_to_av(int sig) { u32 perm = 0; switch (sig) { case SIGCHLD: /* Commonly granted from child to parent. */ perm = PROCESS__SIGCHLD; break; case SIGKILL: /* Cannot be caught or ignored */ perm = PROCESS__SIGKILL; break; case SIGSTOP: /* Cannot be caught or ignored */ perm = PROCESS__SIGSTOP; break; default: /* All other signals. */ perm = PROCESS__SIGNAL; break; } return perm; } #if CAP_LAST_CAP > 63 #error Fix SELinux to handle capabilities > 63. #endif /* Check whether a task is allowed to use a capability. */ static int cred_has_capability(const struct cred *cred, int cap, unsigned int opts, bool initns) { struct common_audit_data ad; struct av_decision avd; u16 sclass; u32 sid = cred_sid(cred); u32 av = CAP_TO_MASK(cap); int rc; ad.type = LSM_AUDIT_DATA_CAP; ad.u.cap = cap; switch (CAP_TO_INDEX(cap)) { case 0: sclass = initns ? SECCLASS_CAPABILITY : SECCLASS_CAP_USERNS; break; case 1: sclass = initns ? SECCLASS_CAPABILITY2 : SECCLASS_CAP2_USERNS; break; default: pr_err("SELinux: out of range capability %d\n", cap); BUG(); return -EINVAL; } rc = avc_has_perm_noaudit(sid, sid, sclass, av, 0, &avd); if (!(opts & CAP_OPT_NOAUDIT)) { int rc2 = avc_audit(sid, sid, sclass, av, &avd, rc, &ad); if (rc2) return rc2; } return rc; } /* Check whether a task has a particular permission to an inode. The 'adp' parameter is optional and allows other audit data to be passed (e.g. the dentry). */ static int inode_has_perm(const struct cred *cred, struct inode *inode, u32 perms, struct common_audit_data *adp) { struct inode_security_struct *isec; u32 sid; if (unlikely(IS_PRIVATE(inode))) return 0; sid = cred_sid(cred); isec = selinux_inode(inode); return avc_has_perm(sid, isec->sid, isec->sclass, perms, adp); } /* Same as inode_has_perm, but pass explicit audit data containing the dentry to help the auditing code to more easily generate the pathname if needed. */ static inline int dentry_has_perm(const struct cred *cred, struct dentry *dentry, u32 av) { struct common_audit_data ad; struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec = selinux_inode(inode); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; /* check below is racy, but revalidate will recheck with lock held */ if (data_race(unlikely(isec->initialized != LABEL_INITIALIZED))) __inode_security_revalidate(inode, dentry, true); return inode_has_perm(cred, inode, av, &ad); } /* Same as inode_has_perm, but pass explicit audit data containing the path to help the auditing code to more easily generate the pathname if needed. 
*/ static inline int path_has_perm(const struct cred *cred, const struct path *path, u32 av) { struct common_audit_data ad; struct inode *inode = d_backing_inode(path->dentry); struct inode_security_struct *isec = selinux_inode(inode); ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; /* check below is racy, but revalidate will recheck with lock held */ if (data_race(unlikely(isec->initialized != LABEL_INITIALIZED))) __inode_security_revalidate(inode, path->dentry, true); return inode_has_perm(cred, inode, av, &ad); } /* Same as path_has_perm, but uses the inode from the file struct. */ static inline int file_path_has_perm(const struct cred *cred, struct file *file, u32 av) { struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; return inode_has_perm(cred, file_inode(file), av, &ad); } #ifdef CONFIG_BPF_SYSCALL static int bpf_fd_pass(const struct file *file, u32 sid); #endif /* Check whether a task can use an open file descriptor to access an inode in a given way. Check access to the descriptor itself, and then use dentry_has_perm to check a particular permission to the file. Access to the descriptor is implicitly granted if it has the same SID as the process. If av is zero, then access to the file is not checked, e.g. for cases where only the descriptor is affected like seek. */ static int file_has_perm(const struct cred *cred, struct file *file, u32 av) { struct file_security_struct *fsec = selinux_file(file); struct inode *inode = file_inode(file); struct common_audit_data ad; u32 sid = cred_sid(cred); int rc; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; if (sid != fsec->sid) { rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) goto out; } #ifdef CONFIG_BPF_SYSCALL rc = bpf_fd_pass(file, cred_sid(cred)); if (rc) return rc; #endif /* av is zero if only checking access to the descriptor. */ rc = 0; if (av) rc = inode_has_perm(cred, inode, av, &ad); out: return rc; } /* * Determine the label for an inode that might be unioned. */ static int selinux_determine_inode_label(const struct cred_security_struct *crsec, struct inode *dir, const struct qstr *name, u16 tclass, u32 *_new_isid) { const struct superblock_security_struct *sbsec = selinux_superblock(dir->i_sb); if ((sbsec->flags & SE_SBINITIALIZED) && (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) { *_new_isid = sbsec->mntpoint_sid; } else if ((sbsec->flags & SBLABEL_MNT) && crsec->create_sid) { *_new_isid = crsec->create_sid; } else { const struct inode_security_struct *dsec = inode_security(dir); return security_transition_sid(crsec->sid, dsec->sid, tclass, name, _new_isid); } return 0; } /* Check whether a task can create a file. 
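 *
 * The check below requires add_name and search on the parent directory,
 * create on the label computed for the new inode, and associate between
 * that new label and the filesystem.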
*/ static int may_create(struct inode *dir, struct dentry *dentry, u16 tclass) { const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; u32 sid, newsid; struct common_audit_data ad; int rc; dsec = inode_security(dir); sbsec = selinux_superblock(dir->i_sb); sid = crsec->sid; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; rc = avc_has_perm(sid, dsec->sid, SECCLASS_DIR, DIR__ADD_NAME | DIR__SEARCH, &ad); if (rc) return rc; rc = selinux_determine_inode_label(crsec, dir, &dentry->d_name, tclass, &newsid); if (rc) return rc; rc = avc_has_perm(sid, newsid, tclass, FILE__CREATE, &ad); if (rc) return rc; return avc_has_perm(newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } #define MAY_LINK 0 #define MAY_UNLINK 1 #define MAY_RMDIR 2 /* Check whether a task can link, unlink, or rmdir a file/directory. */ static int may_link(struct inode *dir, struct dentry *dentry, int kind) { struct inode_security_struct *dsec, *isec; struct common_audit_data ad; u32 sid = current_sid(); u32 av; int rc; dsec = inode_security(dir); isec = backing_inode_security(dentry); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; av = DIR__SEARCH; av |= (kind ? DIR__REMOVE_NAME : DIR__ADD_NAME); rc = avc_has_perm(sid, dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; switch (kind) { case MAY_LINK: av = FILE__LINK; break; case MAY_UNLINK: av = FILE__UNLINK; break; case MAY_RMDIR: av = DIR__RMDIR; break; default: pr_warn("SELinux: %s: unrecognized kind %d\n", __func__, kind); return 0; } rc = avc_has_perm(sid, isec->sid, isec->sclass, av, &ad); return rc; } static inline int may_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode_security_struct *old_dsec, *new_dsec, *old_isec, *new_isec; struct common_audit_data ad; u32 sid = current_sid(); u32 av; int old_is_dir, new_is_dir; int rc; old_dsec = inode_security(old_dir); old_isec = backing_inode_security(old_dentry); old_is_dir = d_is_dir(old_dentry); new_dsec = inode_security(new_dir); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = old_dentry; rc = avc_has_perm(sid, old_dsec->sid, SECCLASS_DIR, DIR__REMOVE_NAME | DIR__SEARCH, &ad); if (rc) return rc; rc = avc_has_perm(sid, old_isec->sid, old_isec->sclass, FILE__RENAME, &ad); if (rc) return rc; if (old_is_dir && new_dir != old_dir) { rc = avc_has_perm(sid, old_isec->sid, old_isec->sclass, DIR__REPARENT, &ad); if (rc) return rc; } ad.u.dentry = new_dentry; av = DIR__ADD_NAME | DIR__SEARCH; if (d_is_positive(new_dentry)) av |= DIR__REMOVE_NAME; rc = avc_has_perm(sid, new_dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; if (d_is_positive(new_dentry)) { new_isec = backing_inode_security(new_dentry); new_is_dir = d_is_dir(new_dentry); rc = avc_has_perm(sid, new_isec->sid, new_isec->sclass, (new_is_dir ? DIR__RMDIR : FILE__UNLINK), &ad); if (rc) return rc; } return 0; } /* Check whether a task can perform a filesystem operation. */ static int superblock_has_perm(const struct cred *cred, const struct super_block *sb, u32 perms, struct common_audit_data *ad) { struct superblock_security_struct *sbsec; u32 sid = cred_sid(cred); sbsec = selinux_superblock(sb); return avc_has_perm(sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad); } /* Convert a Linux mode and permission mask to an access vector. 
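 *
 * For example (illustrative only): MAY_READ|MAY_WRITE on a regular file
 * maps to FILE__READ|FILE__WRITE, while the same mask on a directory maps
 * to DIR__READ|DIR__WRITE; for non-directories MAY_APPEND takes precedence
 * over MAY_WRITE.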
*/ static inline u32 file_mask_to_av(int mode, int mask) { u32 av = 0; if (!S_ISDIR(mode)) { if (mask & MAY_EXEC) av |= FILE__EXECUTE; if (mask & MAY_READ) av |= FILE__READ; if (mask & MAY_APPEND) av |= FILE__APPEND; else if (mask & MAY_WRITE) av |= FILE__WRITE; } else { if (mask & MAY_EXEC) av |= DIR__SEARCH; if (mask & MAY_WRITE) av |= DIR__WRITE; if (mask & MAY_READ) av |= DIR__READ; } return av; } /* Convert a Linux file to an access vector. */ static inline u32 file_to_av(const struct file *file) { u32 av = 0; if (file->f_mode & FMODE_READ) av |= FILE__READ; if (file->f_mode & FMODE_WRITE) { if (file->f_flags & O_APPEND) av |= FILE__APPEND; else av |= FILE__WRITE; } if (!av) { /* * Special file opened with flags 3 for ioctl-only use. */ av = FILE__IOCTL; } return av; } /* * Convert a file to an access vector and include the correct * open permission. */ static inline u32 open_file_to_av(struct file *file) { u32 av = file_to_av(file); struct inode *inode = file_inode(file); if (selinux_policycap_openperm() && inode->i_sb->s_magic != SOCKFS_MAGIC) av |= FILE__OPEN; return av; } /* Hook functions begin here. */ static int selinux_binder_set_context_mgr(const struct cred *mgr) { return avc_has_perm(current_sid(), cred_sid(mgr), SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } static int selinux_binder_transaction(const struct cred *from, const struct cred *to) { u32 mysid = current_sid(); u32 fromsid = cred_sid(from); u32 tosid = cred_sid(to); int rc; if (mysid != fromsid) { rc = avc_has_perm(mysid, fromsid, SECCLASS_BINDER, BINDER__IMPERSONATE, NULL); if (rc) return rc; } return avc_has_perm(fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, NULL); } static int selinux_binder_transfer_binder(const struct cred *from, const struct cred *to) { return avc_has_perm(cred_sid(from), cred_sid(to), SECCLASS_BINDER, BINDER__TRANSFER, NULL); } static int selinux_binder_transfer_file(const struct cred *from, const struct cred *to, const struct file *file) { u32 sid = cred_sid(to); struct file_security_struct *fsec = selinux_file(file); struct dentry *dentry = file->f_path.dentry; struct inode_security_struct *isec; struct common_audit_data ad; int rc; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = file->f_path; if (sid != fsec->sid) { rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } #ifdef CONFIG_BPF_SYSCALL rc = bpf_fd_pass(file, sid); if (rc) return rc; #endif if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; isec = backing_inode_security(dentry); return avc_has_perm(sid, isec->sid, isec->sclass, file_to_av(file), &ad); } static int selinux_ptrace_access_check(struct task_struct *child, unsigned int mode) { u32 sid = current_sid(); u32 csid = task_sid_obj(child); if (mode & PTRACE_MODE_READ) return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, NULL); return avc_has_perm(sid, csid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_ptrace_traceme(struct task_struct *parent) { return avc_has_perm(task_sid_obj(parent), task_sid_obj(current), SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return avc_has_perm(current_sid(), task_sid_obj(target), SECCLASS_PROCESS, PROCESS__GETCAP, NULL); } static int selinux_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return avc_has_perm(cred_sid(old), cred_sid(new), 
SECCLASS_PROCESS, PROCESS__SETCAP, NULL); } /* * (This comment used to live with the selinux_task_setuid hook, * which was removed). * * Since setuid only affects the current process, and since the SELinux * controls are not based on the Linux identity attributes, SELinux does not * need to control this operation. However, SELinux does control the use of * the CAP_SETUID and CAP_SETGID capabilities using the capable hook. */ static int selinux_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return cred_has_capability(cred, cap, opts, ns == &init_user_ns); } static int selinux_quotactl(int cmds, int type, int id, const struct super_block *sb) { const struct cred *cred = current_cred(); int rc = 0; if (!sb) return 0; switch (cmds) { case Q_SYNC: case Q_QUOTAON: case Q_QUOTAOFF: case Q_SETINFO: case Q_SETQUOTA: case Q_XQUOTAOFF: case Q_XQUOTAON: case Q_XSETQLIM: rc = superblock_has_perm(cred, sb, FILESYSTEM__QUOTAMOD, NULL); break; case Q_GETFMT: case Q_GETINFO: case Q_GETQUOTA: case Q_XGETQUOTA: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XGETNEXTQUOTA: rc = superblock_has_perm(cred, sb, FILESYSTEM__QUOTAGET, NULL); break; default: rc = 0; /* let the kernel handle invalid cmds */ break; } return rc; } static int selinux_quota_on(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__QUOTAON); } static int selinux_syslog(int type) { switch (type) { case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ, NULL); case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE, NULL); } /* All other syslog types */ return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD, NULL); } /* * Check permission for allocating a new virtual mapping. Returns * 0 if permission is granted, negative error code if not. * * Do not audit the selinux permission check, as this is applied to all * processes that allocate mappings. */ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) { return cred_has_capability(current_cred(), CAP_SYS_ADMIN, CAP_OPT_NOAUDIT, true); } /* binprm security operations */ static u32 ptrace_parent_sid(void) { u32 sid = 0; struct task_struct *tracer; rcu_read_lock(); tracer = ptrace_parent(current); if (tracer) sid = task_sid_obj(tracer); rcu_read_unlock(); return sid; } static int check_nnp_nosuid(const struct linux_binprm *bprm, const struct cred_security_struct *old_crsec, const struct cred_security_struct *new_crsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); int rc; u32 av; if (!nnp && !nosuid) return 0; /* neither NNP nor nosuid */ if (new_crsec->sid == old_crsec->sid) return 0; /* No change in credentials */ /* * If the policy enables the nnp_nosuid_transition policy capability, * then we permit transitions under NNP or nosuid if the * policy allows the corresponding permission between * the old and new contexts. 
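 * nnp_transition covers the NO_NEW_PRIVS case and nosuid_transition covers the nosuid mount case; both are process2 permissions checked from the old context to the new context.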
*/ if (selinux_policycap_nnp_nosuid_transition()) { av = 0; if (nnp) av |= PROCESS2__NNP_TRANSITION; if (nosuid) av |= PROCESS2__NOSUID_TRANSITION; rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS2, av, NULL); if (!rc) return 0; } /* * We also permit NNP or nosuid transitions to bounded SIDs, * i.e. SIDs that are guaranteed to only be allowed a subset * of the permissions of the current SID. */ rc = security_bounded_transition(old_crsec->sid, new_crsec->sid); if (!rc) return 0; /* * On failure, preserve the errno values for NNP vs nosuid. * NNP: Operation not permitted for caller. * nosuid: Permission denied to file. */ if (nnp) return -EPERM; return -EACCES; } static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) { const struct cred_security_struct *old_crsec; struct cred_security_struct *new_crsec; struct inode_security_struct *isec; struct common_audit_data ad; struct inode *inode = file_inode(bprm->file); int rc; /* SELinux context only depends on initial program or script and not * the script interpreter */ old_crsec = selinux_cred(current_cred()); new_crsec = selinux_cred(bprm->cred); isec = inode_security(inode); if (WARN_ON(isec->sclass != SECCLASS_FILE && isec->sclass != SECCLASS_MEMFD_FILE)) return -EACCES; /* Default to the current task SID. */ new_crsec->sid = old_crsec->sid; new_crsec->osid = old_crsec->sid; /* Reset fs, key, and sock SIDs on execve. */ new_crsec->create_sid = 0; new_crsec->keycreate_sid = 0; new_crsec->sockcreate_sid = 0; /* * Before policy is loaded, label any task outside kernel space * as SECINITSID_INIT, so that any userspace tasks surviving from * early boot end up with a label different from SECINITSID_KERNEL * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL). */ if (!selinux_initialized()) { new_crsec->sid = SECINITSID_INIT; /* also clear the exec_sid just in case */ new_crsec->exec_sid = 0; return 0; } if (old_crsec->exec_sid) { new_crsec->sid = old_crsec->exec_sid; /* Reset exec SID on execve. */ new_crsec->exec_sid = 0; /* Fail on NNP or nosuid if not an allowed transition. */ rc = check_nnp_nosuid(bprm, old_crsec, new_crsec); if (rc) return rc; } else { /* Check for a default transition on this program. */ rc = security_transition_sid(old_crsec->sid, isec->sid, SECCLASS_PROCESS, NULL, &new_crsec->sid); if (rc) return rc; /* * Fallback to old SID on NNP or nosuid if not an allowed * transition. */ rc = check_nnp_nosuid(bprm, old_crsec, new_crsec); if (rc) new_crsec->sid = old_crsec->sid; } ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = bprm->file; if (new_crsec->sid == old_crsec->sid) { rc = avc_has_perm(old_crsec->sid, isec->sid, isec->sclass, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. 
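 * A transition needs process:transition from the old to the new context and file:entrypoint on the executable for the new context; additional checks below cover shared state (share), ptrace and secure-exec (noatsecure).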
*/ rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; rc = avc_has_perm(new_crsec->sid, isec->sid, isec->sclass, FILE__ENTRYPOINT, &ad); if (rc) return rc; /* Check for shared state */ if (bprm->unsafe & LSM_UNSAFE_SHARE) { rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__SHARE, NULL); if (rc) return -EPERM; } /* Make sure that anyone attempting to ptrace over a task that * changes its SID has the appropriate permit */ if (bprm->unsafe & LSM_UNSAFE_PTRACE) { u32 ptsid = ptrace_parent_sid(); if (ptsid != 0) { rc = avc_has_perm(ptsid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (rc) return -EPERM; } } /* Clear any possibly unsafe personality bits on exec: */ bprm->per_clear |= PER_CLEAR_ON_SETID; /* Enable secure mode for SIDs transitions unless the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. */ rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__NOATSECURE, NULL); bprm->secureexec |= !!rc; } return 0; } static int match_file(const void *p, struct file *file, unsigned fd) { return file_has_perm(p, file, file_to_av(file)) ? fd + 1 : 0; } /* Derived from fs/exec.c:flush_old_files. */ static inline void flush_unauthorized_files(const struct cred *cred, struct files_struct *files) { struct file *file, *devnull = NULL; struct tty_struct *tty; int drop_tty = 0; unsigned n; tty = get_current_tty(); if (tty) { spin_lock(&tty->files_lock); if (!list_empty(&tty->tty_files)) { struct tty_file_private *file_priv; /* Revalidate access to controlling tty. Use file_path_has_perm on the tty path directly rather than using file_has_perm, as this particular open file may belong to another process and we are only interested in the inode-based check here. */ file_priv = list_first_entry(&tty->tty_files, struct tty_file_private, list); file = file_priv->file; if (file_path_has_perm(cred, file, FILE__READ | FILE__WRITE)) drop_tty = 1; } spin_unlock(&tty->files_lock); tty_kref_put(tty); } /* Reset controlling tty. */ if (drop_tty) no_tty(); /* Revalidate access to inherited open files. */ n = iterate_fd(files, 0, match_file, cred); if (!n) /* none found? */ return; devnull = dentry_open(&selinux_null, O_RDWR, cred); if (IS_ERR(devnull)) devnull = NULL; /* replace all the matching ones with this */ do { replace_fd(n - 1, devnull, 0); } while ((n = iterate_fd(files, n, match_file, cred)) != 0); if (devnull) fput(devnull); } /* * Prepare a process for imminent new credential changes due to exec */ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) { struct cred_security_struct *new_crsec; struct rlimit *rlim, *initrlim; int rc, i; new_crsec = selinux_cred(bprm->cred); if (new_crsec->sid == new_crsec->osid) return; /* Close files for which the new task SID is not authorized. */ flush_unauthorized_files(bprm->cred, current->files); /* Always clear parent death signal on SID transitions. */ current->pdeath_signal = 0; /* Check whether the new SID can inherit resource limits from the old * SID. If not, reset all soft limits to the lower of the current * task's hard limit and the init task's soft limit. * * Note that the setting of hard limits (even to lower them) can be * controlled by the setrlimit check. The inclusion of the init task's * soft limit into the computation is to avoid resetting soft limits * higher than the default soft limit for cases where the default is * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. 
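 * In short, each new soft limit becomes min(current hard limit, init soft limit).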
*/ rc = avc_has_perm(new_crsec->osid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__RLIMITINH, NULL); if (rc) { /* protect against do_prlimit() */ task_lock(current); for (i = 0; i < RLIM_NLIMITS; i++) { rlim = current->signal->rlim + i; initrlim = init_task.signal->rlim + i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } task_unlock(current); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); } } /* * Clean up the process immediately after the installation of new credentials * due to exec */ static void selinux_bprm_committed_creds(const struct linux_binprm *bprm) { const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 osid, sid; int rc; osid = crsec->osid; sid = crsec->sid; if (sid == osid) return; /* Check whether the new SID can inherit signal state from the old SID. * If not, clear itimers to avoid subsequent signal generation and * flush and unblock signals. * * This must occur _after_ the task SID has been updated so that any * kill done after the flush will be checked against the new SID. */ rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { clear_itimer(); spin_lock_irq(&unrcu_pointer(current->sighand)->siglock); if (!fatal_signal_pending(current)) { flush_sigqueue(&current->pending); flush_sigqueue(&current->signal->shared_pending); flush_signal_handlers(current, 1); sigemptyset(&current->blocked); recalc_sigpending(); } spin_unlock_irq(&unrcu_pointer(current->sighand)->siglock); } /* Wake up the parent if it is waiting so that it can recheck * wait permission to the new task SID. */ read_lock(&tasklist_lock); __wake_up_parent(current, unrcu_pointer(current->real_parent)); read_unlock(&tasklist_lock); } /* superblock security operations */ static int selinux_sb_alloc_security(struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); mutex_init(&sbsec->lock); INIT_LIST_HEAD(&sbsec->isec_head); spin_lock_init(&sbsec->isec_lock); sbsec->sid = SECINITSID_UNLABELED; sbsec->def_sid = SECINITSID_FILE; sbsec->mntpoint_sid = SECINITSID_UNLABELED; return 0; } static inline int opt_len(const char *s) { bool open_quote = false; int len; char c; for (len = 0; (c = s[len]) != '\0'; len++) { if (c == '"') open_quote = !open_quote; if (c == ',' && !open_quote) break; } return len; } static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts) { char *from = options; char *to = options; bool first = true; int rc; while (1) { int len = opt_len(from); int token; char *arg = NULL; token = match_opt_prefix(from, len, &arg); if (token != Opt_error) { char *p, *q; /* strip quotes */ if (arg) { for (p = q = arg; p < from + len; p++) { char c = *p; if (c != '"') *q++ = c; } arg = kmemdup_nul(arg, q - arg, GFP_KERNEL); if (!arg) { rc = -ENOMEM; goto free_opt; } } rc = selinux_add_opt(token, arg, mnt_opts); kfree(arg); arg = NULL; if (unlikely(rc)) { goto free_opt; } } else { if (!first) { // copy with preceding comma from--; len++; } if (to != from) memmove(to, from, len); to += len; first = false; } if (!from[len]) break; from += len + 1; } *to = '\0'; return 0; free_opt: if (*mnt_opts) { selinux_free_mnt_opts(*mnt_opts); *mnt_opts = NULL; } return rc; } static int selinux_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts) { struct selinux_mnt_opts *opts = mnt_opts; struct superblock_security_struct *sbsec = selinux_superblock(sb); /* * Superblock not initialized (i.e. no options) - reject if any * options specified, otherwise accept. 
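 * As elsewhere in this function, a non-zero return value means the options are incompatible with the existing superblock.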
*/ if (!(sbsec->flags & SE_SBINITIALIZED)) return opts ? 1 : 0; /* * Superblock initialized and no options specified - reject if * superblock has any options set, otherwise accept. */ if (!opts) return (sbsec->flags & SE_MNTMASK) ? 1 : 0; if (opts->fscontext_sid) { if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, opts->fscontext_sid)) return 1; } if (opts->context_sid) { if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, opts->context_sid)) return 1; } if (opts->rootcontext_sid) { struct inode_security_struct *root_isec; root_isec = backing_inode_security(sb->s_root); if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, opts->rootcontext_sid)) return 1; } if (opts->defcontext_sid) { if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, opts->defcontext_sid)) return 1; } return 0; } static int selinux_sb_remount(struct super_block *sb, void *mnt_opts) { struct selinux_mnt_opts *opts = mnt_opts; struct superblock_security_struct *sbsec = selinux_superblock(sb); if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; if (!opts) return 0; if (opts->fscontext_sid) { if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, opts->fscontext_sid)) goto out_bad_option; } if (opts->context_sid) { if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, opts->context_sid)) goto out_bad_option; } if (opts->rootcontext_sid) { struct inode_security_struct *root_isec; root_isec = backing_inode_security(sb->s_root); if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, opts->rootcontext_sid)) goto out_bad_option; } if (opts->defcontext_sid) { if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, opts->defcontext_sid)) goto out_bad_option; } return 0; out_bad_option: pr_warn("SELinux: unable to change security options " "during remount (dev %s, type=%s)\n", sb->s_id, sb->s_type->name); return -EINVAL; } static int selinux_sb_kern_mount(const struct super_block *sb) { const struct cred *cred = current_cred(); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = sb->s_root; return superblock_has_perm(cred, sb, FILESYSTEM__MOUNT, &ad); } static int selinux_sb_statfs(struct dentry *dentry) { const struct cred *cred = current_cred(); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry->d_sb->s_root; return superblock_has_perm(cred, dentry->d_sb, FILESYSTEM__GETATTR, &ad); } static int selinux_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { const struct cred *cred = current_cred(); if (flags & MS_REMOUNT) return superblock_has_perm(cred, path->dentry->d_sb, FILESYSTEM__REMOUNT, NULL); else return path_has_perm(cred, path, FILE__MOUNTON); } static int selinux_move_mount(const struct path *from_path, const struct path *to_path) { const struct cred *cred = current_cred(); return path_has_perm(cred, to_path, FILE__MOUNTON); } static int selinux_umount(struct vfsmount *mnt, int flags) { const struct cred *cred = current_cred(); return superblock_has_perm(cred, mnt->mnt_sb, FILESYSTEM__UNMOUNT, NULL); } static int selinux_fs_context_submount(struct fs_context *fc, struct super_block *reference) { const struct superblock_security_struct *sbsec = selinux_superblock(reference); struct selinux_mnt_opts *opts; /* * Ensure that fc->security remains NULL when no options are set * as expected by selinux_set_mnt_opts(). 
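 * Only the fscontext, context and defcontext options are carried over to the submount; rootcontext is not propagated.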
*/ if (!(sbsec->flags & (FSCONTEXT_MNT|CONTEXT_MNT|DEFCONTEXT_MNT))) return 0; opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; if (sbsec->flags & FSCONTEXT_MNT) opts->fscontext_sid = sbsec->sid; if (sbsec->flags & CONTEXT_MNT) opts->context_sid = sbsec->mntpoint_sid; if (sbsec->flags & DEFCONTEXT_MNT) opts->defcontext_sid = sbsec->def_sid; fc->security = opts; return 0; } static int selinux_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { const struct selinux_mnt_opts *src = src_fc->security; if (!src) return 0; fc->security = kmemdup(src, sizeof(*src), GFP_KERNEL); return fc->security ? 0 : -ENOMEM; } static const struct fs_parameter_spec selinux_fs_parameters[] = { fsparam_string(CONTEXT_STR, Opt_context), fsparam_string(DEFCONTEXT_STR, Opt_defcontext), fsparam_string(FSCONTEXT_STR, Opt_fscontext), fsparam_string(ROOTCONTEXT_STR, Opt_rootcontext), fsparam_flag (SECLABEL_STR, Opt_seclabel), {} }; static int selinux_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; int opt; opt = fs_parse(fc, selinux_fs_parameters, param, &result); if (opt < 0) return opt; return selinux_add_opt(opt, param->string, &fc->security); } /* inode security operations */ static int selinux_inode_alloc_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); u32 sid = current_sid(); spin_lock_init(&isec->lock); INIT_LIST_HEAD(&isec->list); isec->inode = inode; isec->sid = SECINITSID_UNLABELED; isec->sclass = SECCLASS_FILE; isec->task_sid = sid; isec->initialized = LABEL_INVALID; return 0; } static void selinux_inode_free_security(struct inode *inode) { inode_free_security(inode); } static int selinux_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, struct lsm_context *cp) { u32 newsid; int rc; rc = selinux_determine_inode_label(selinux_cred(current_cred()), d_inode(dentry->d_parent), name, inode_mode_to_security_class(mode), &newsid); if (rc) return rc; if (xattr_name) *xattr_name = XATTR_NAME_SELINUX; cp->id = LSM_ID_SELINUX; return security_sid_to_context(newsid, &cp->context, &cp->len); } static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, const struct qstr *name, const struct cred *old, struct cred *new) { u32 newsid; int rc; struct cred_security_struct *crsec; rc = selinux_determine_inode_label(selinux_cred(old), d_inode(dentry->d_parent), name, inode_mode_to_security_class(mode), &newsid); if (rc) return rc; crsec = selinux_cred(new); crsec->create_sid = newsid; return 0; } static int selinux_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) { const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct superblock_security_struct *sbsec; struct xattr *xattr = lsm_get_xattr_slot(xattrs, xattr_count); u32 newsid, clen; u16 newsclass; int rc; char *context; sbsec = selinux_superblock(dir->i_sb); newsid = crsec->create_sid; newsclass = inode_mode_to_security_class(inode->i_mode); rc = selinux_determine_inode_label(crsec, dir, qstr, newsclass, &newsid); if (rc) return rc; /* Possibly defer initialization to selinux_complete_init. 
*/ if (sbsec->flags & SE_SBINITIALIZED) { struct inode_security_struct *isec = selinux_inode(inode); isec->sclass = newsclass; isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; } if (!selinux_initialized() || !(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (xattr) { rc = security_sid_to_context_force(newsid, &context, &clen); if (rc) return rc; xattr->value = context; xattr->value_len = clen; xattr->name = XATTR_SELINUX_SUFFIX; } return 0; } static int selinux_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode) { u32 sid = current_sid(); struct common_audit_data ad; struct inode_security_struct *isec; int rc; bool is_memfd = false; if (unlikely(!selinux_initialized())) return 0; if (name != NULL && name->name != NULL && !strcmp(name->name, MEMFD_ANON_NAME)) { if (!selinux_policycap_memfd_class()) return 0; is_memfd = true; } isec = selinux_inode(inode); /* * We only get here once per ephemeral inode. The inode has * been initialized via inode_alloc_security but is otherwise * untouched. */ if (context_inode) { struct inode_security_struct *context_isec = selinux_inode(context_inode); if (context_isec->initialized != LABEL_INITIALIZED) { pr_err("SELinux: context_inode is not initialized\n"); return -EACCES; } isec->sclass = context_isec->sclass; isec->sid = context_isec->sid; } else { if (is_memfd) isec->sclass = SECCLASS_MEMFD_FILE; else isec->sclass = SECCLASS_ANON_INODE; rc = security_transition_sid( sid, sid, isec->sclass, name, &isec->sid); if (rc) return rc; } isec->initialized = LABEL_INITIALIZED; /* * Now that we've initialized security, check whether we're * allowed to actually create this type of anonymous inode. */ ad.type = LSM_AUDIT_DATA_ANONINODE; ad.u.anonclass = name ? 
(const char *)name->name : "?"; return avc_has_perm(sid, isec->sid, isec->sclass, FILE__CREATE, &ad); } static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { return may_create(dir, dentry, SECCLASS_FILE); } static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { return may_link(dir, old_dentry, MAY_LINK); } static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry) { return may_link(dir, dentry, MAY_UNLINK); } static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const char *name) { return may_create(dir, dentry, SECCLASS_LNK_FILE); } static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask) { return may_create(dir, dentry, SECCLASS_DIR); } static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry) { return may_link(dir, dentry, MAY_RMDIR); } static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { return may_create(dir, dentry, inode_mode_to_security_class(mode)); } static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry) { return may_rename(old_inode, old_dentry, new_inode, new_dentry); } static int selinux_inode_readlink(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__READ); } static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { struct common_audit_data ad; struct inode_security_struct *isec; u32 sid = current_sid(); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; isec = inode_security_rcu(inode, rcu); if (IS_ERR(isec)) return PTR_ERR(isec); return avc_has_perm(sid, isec->sid, isec->sclass, FILE__READ, &ad); } static noinline int audit_inode_permission(struct inode *inode, u32 perms, u32 audited, u32 denied, int result) { struct common_audit_data ad; struct inode_security_struct *isec = selinux_inode(inode); ad.type = LSM_AUDIT_DATA_INODE; ad.u.inode = inode; return slow_avc_audit(current_sid(), isec->sid, isec->sclass, perms, audited, denied, result, &ad); } /** * task_avdcache_reset - Reset the task's AVD cache * @tsec: the task's security state * * Clear the task's AVD cache in @tsec and reset it to the current policy's * and task's info. */ static inline void task_avdcache_reset(struct task_security_struct *tsec) { memset(&tsec->avdcache.dir, 0, sizeof(tsec->avdcache.dir)); tsec->avdcache.sid = current_sid(); tsec->avdcache.seqno = avc_policy_seqno(); tsec->avdcache.dir_spot = TSEC_AVDC_DIR_SIZE - 1; } /** * task_avdcache_search - Search the task's AVD cache * @tsec: the task's security state * @isec: the inode to search for in the cache * @avdc: matching avd cache entry returned to the caller * * Search @tsec for a AVD cache entry that matches @isec and return it to the * caller via @avdc. Returns 0 if a match is found, negative values otherwise. 
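 * The cache is a small fixed-size array (TSEC_AVDC_DIR_SIZE entries) of directory decisions, searched starting from the slot of the most recent hit.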
*/ static inline int task_avdcache_search(struct task_security_struct *tsec, struct inode_security_struct *isec, struct avdc_entry **avdc) { int orig, iter; /* focused on path walk optimization, only cache directories */ if (isec->sclass != SECCLASS_DIR) return -ENOENT; if (unlikely(current_sid() != tsec->avdcache.sid || tsec->avdcache.seqno != avc_policy_seqno())) { task_avdcache_reset(tsec); return -ENOENT; } orig = iter = tsec->avdcache.dir_spot; do { if (tsec->avdcache.dir[iter].isid == isec->sid) { /* cache hit */ tsec->avdcache.dir_spot = iter; *avdc = &tsec->avdcache.dir[iter]; return 0; } iter = (iter - 1) & (TSEC_AVDC_DIR_SIZE - 1); } while (iter != orig); return -ENOENT; } /** * task_avdcache_update - Update the task's AVD cache * @tsec: the task's security state * @isec: the inode associated with the cache entry * @avd: the AVD to cache * @audited: the permission audit bitmask to cache * * Update the AVD cache in @tsec with the @avdc and @audited info associated * with @isec. */ static inline void task_avdcache_update(struct task_security_struct *tsec, struct inode_security_struct *isec, struct av_decision *avd, u32 audited) { int spot; /* focused on path walk optimization, only cache directories */ if (isec->sclass != SECCLASS_DIR) return; /* update cache */ spot = (tsec->avdcache.dir_spot + 1) & (TSEC_AVDC_DIR_SIZE - 1); tsec->avdcache.dir_spot = spot; tsec->avdcache.dir[spot].isid = isec->sid; tsec->avdcache.dir[spot].audited = audited; tsec->avdcache.dir[spot].allowed = avd->allowed; tsec->avdcache.dir[spot].permissive = avd->flags & AVD_FLAGS_PERMISSIVE; tsec->avdcache.permissive_neveraudit = (avd->flags == (AVD_FLAGS_PERMISSIVE|AVD_FLAGS_NEVERAUDIT)); } /** * selinux_inode_permission - Check if the current task can access an inode * @inode: the inode that is being accessed * @requested: the accesses being requested * * Check if the current task is allowed to access @inode according to * @requested. Returns 0 if allowed, negative values otherwise. */ static int selinux_inode_permission(struct inode *inode, int requested) { int mask; u32 perms; u32 sid = current_sid(); struct task_security_struct *tsec; struct inode_security_struct *isec; struct avdc_entry *avdc; int rc, rc2; u32 audited, denied; mask = requested & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND); /* No permission to check. Existence test. */ if (!mask) return 0; tsec = selinux_task(current); if (task_avdcache_permnoaudit(tsec, sid)) return 0; isec = inode_security_rcu(inode, requested & MAY_NOT_BLOCK); if (IS_ERR(isec)) return PTR_ERR(isec); perms = file_mask_to_av(inode->i_mode, mask); rc = task_avdcache_search(tsec, isec, &avdc); if (likely(!rc)) { /* Cache hit. */ audited = perms & avdc->audited; denied = perms & ~avdc->allowed; if (unlikely(denied && enforcing_enabled() && !avdc->permissive)) rc = -EACCES; } else { struct av_decision avd; /* Cache miss. */ rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass, perms, 0, &avd); audited = avc_audit_required(perms, &avd, rc, (requested & MAY_ACCESS) ? 
FILE__AUDIT_ACCESS : 0, &denied); task_avdcache_update(tsec, isec, &avd, audited); } if (likely(!audited)) return rc; rc2 = audit_inode_permission(inode, perms, audited, denied, rc); if (rc2) return rc2; return rc; } static int selinux_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { const struct cred *cred = current_cred(); struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = iattr->ia_valid; u32 av = FILE__WRITE; /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */ if (ia_valid & ATTR_FORCE) { ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE | ATTR_FORCE); if (!ia_valid) return 0; } if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET)) return dentry_has_perm(cred, dentry, FILE__SETATTR); if (selinux_policycap_openperm() && inode->i_sb->s_magic != SOCKFS_MAGIC && (ia_valid & ATTR_SIZE) && !(ia_valid & ATTR_FILE)) av |= FILE__OPEN; return dentry_has_perm(cred, dentry, av); } static int selinux_inode_getattr(const struct path *path) { struct task_security_struct *tsec; tsec = selinux_task(current); if (task_avdcache_permnoaudit(tsec, current_sid())) return 0; return path_has_perm(current_cred(), path, FILE__GETATTR); } static bool has_cap_mac_admin(bool audit) { const struct cred *cred = current_cred(); unsigned int opts = audit ? CAP_OPT_NONE : CAP_OPT_NOAUDIT; if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts)) return false; if (cred_has_capability(cred, CAP_MAC_ADMIN, opts, true)) return false; return true; } /** * selinux_inode_xattr_skipcap - Skip the xattr capability checks? * @name: name of the xattr * * Returns 1 to indicate that SELinux "owns" the access control rights to xattrs * named @name; the LSM layer should avoid enforcing any traditional * capability based access controls on this xattr. Returns 0 to indicate that * SELinux does not "own" the access control rights to xattrs named @name and is * deferring to the LSM layer for further access controls, including capability * based controls. */ static int selinux_inode_xattr_skipcap(const char *name) { /* require capability check if not a selinux xattr */ return !strcmp(name, XATTR_NAME_SELINUX); } static int selinux_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec; struct superblock_security_struct *sbsec; struct common_audit_data ad; u32 newsid, sid = current_sid(); int rc = 0; /* if not a selinux xattr, only check the ordinary setattr perm */ if (strcmp(name, XATTR_NAME_SELINUX)) return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); if (!selinux_initialized()) return (inode_owner_or_capable(idmap, inode) ? 
0 : -EPERM); sbsec = selinux_superblock(inode->i_sb); if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; isec = backing_inode_security(dentry); rc = avc_has_perm(sid, isec->sid, isec->sclass, FILE__RELABELFROM, &ad); if (rc) return rc; rc = security_context_to_sid(value, size, &newsid, GFP_KERNEL); if (rc == -EINVAL) { if (!has_cap_mac_admin(true)) { struct audit_buffer *ab; size_t audit_size; /* We strip a nul only if it is at the end, otherwise the * context contains a nul and we should audit that */ if (value) { const char *str = value; if (str[size - 1] == '\0') audit_size = size - 1; else audit_size = size; } else { audit_size = 0; } ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) return rc; audit_log_format(ab, "op=setxattr invalid_context="); audit_log_n_untrustedstring(ab, value, audit_size); audit_log_end(ab); return rc; } rc = security_context_to_sid_force(value, size, &newsid); } if (rc) return rc; rc = avc_has_perm(sid, newsid, isec->sclass, FILE__RELABELTO, &ad); if (rc) return rc; rc = security_validate_transition(isec->sid, newsid, sid, isec->sclass); if (rc) return rc; return avc_has_perm(newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } static int selinux_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } static int selinux_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return dentry_has_perm(current_cred(), dentry, FILE__GETATTR); } static int selinux_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec; u32 newsid; int rc; if (strcmp(name, XATTR_NAME_SELINUX)) { /* Not an attribute we recognize, so nothing to do. */ return; } if (!selinux_initialized()) { /* If we haven't even been initialized, then we can't validate * against a policy, so leave the label as invalid. It may * resolve to a valid label on the next revalidation try if * we've since initialized. 
*/ return; } rc = security_context_to_sid_force(value, size, &newsid); if (rc) { pr_err("SELinux: unable to map context to SID" "for (%s, %lu), rc=%d\n", inode->i_sb->s_id, inode->i_ino, -rc); return; } isec = backing_inode_security(dentry); spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); } static int selinux_inode_getxattr(struct dentry *dentry, const char *name) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__GETATTR); } static int selinux_inode_listxattr(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__GETATTR); } static int selinux_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { /* if not a selinux xattr, only check the ordinary setattr perm */ if (strcmp(name, XATTR_NAME_SELINUX)) return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); if (!selinux_initialized()) return 0; /* No one is allowed to remove a SELinux security label. You can change the label, but all data must be labeled. */ return -EACCES; } static int selinux_inode_file_setattr(struct dentry *dentry, struct file_kattr *fa) { return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } static int selinux_inode_file_getattr(struct dentry *dentry, struct file_kattr *fa) { return dentry_has_perm(current_cred(), dentry, FILE__GETATTR); } static int selinux_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { int ret; u32 perm; struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; /* * Set permission needed based on the type of mark being set. * Performs an additional check for sb watches. */ switch (obj_type) { case FSNOTIFY_OBJ_TYPE_VFSMOUNT: perm = FILE__WATCH_MOUNT; break; case FSNOTIFY_OBJ_TYPE_SB: perm = FILE__WATCH_SB; ret = superblock_has_perm(current_cred(), path->dentry->d_sb, FILESYSTEM__WATCH, &ad); if (ret) return ret; break; case FSNOTIFY_OBJ_TYPE_INODE: perm = FILE__WATCH; break; case FSNOTIFY_OBJ_TYPE_MNTNS: perm = FILE__WATCH_MOUNTNS; break; default: return -EINVAL; } /* blocking watches require the file:watch_with_perm permission */ if (mask & (ALL_FSNOTIFY_PERM_EVENTS)) perm |= FILE__WATCH_WITH_PERM; /* watches on read-like events need the file:watch_reads permission */ if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS | FS_CLOSE_NOWRITE)) perm |= FILE__WATCH_READS; return path_has_perm(current_cred(), path, perm); } /* * Copy the inode security context value to the user. * * Permission check is handled by selinux_inode_getxattr hook. */ static int selinux_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { u32 size; int error; char *context = NULL; struct inode_security_struct *isec; /* * If we're not initialized yet, then we can't validate contexts, so * just let vfs_getxattr fall back to using the on-disk xattr. */ if (!selinux_initialized() || strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; /* * If the caller has CAP_MAC_ADMIN, then get the raw context * value even if it is not defined by current policy; otherwise, * use the in-core value under current policy. * Use the non-auditing forms of the permission checks since * getxattr may be called by unprivileged processes commonly * and lack of permission just means that we fall back to the * in-core context value, not a denial. 
*/ isec = inode_security(inode); if (has_cap_mac_admin(false)) error = security_sid_to_context_force(isec->sid, &context, &size); else error = security_sid_to_context(isec->sid, &context, &size); if (error) return error; error = size; if (alloc) { *buffer = context; goto out_nofree; } kfree(context); out_nofree: return error; } static int selinux_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct inode_security_struct *isec = inode_security_novalidate(inode); struct superblock_security_struct *sbsec; u32 newsid; int rc; if (strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; sbsec = selinux_superblock(inode->i_sb); if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!value || !size) return -EACCES; rc = security_context_to_sid(value, size, &newsid, GFP_KERNEL); if (rc) return rc; spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); return 0; } static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { const int len = sizeof(XATTR_NAME_SELINUX); if (!selinux_initialized()) return 0; if (buffer && len <= buffer_size) memcpy(buffer, XATTR_NAME_SELINUX, len); return len; } static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop) { struct inode_security_struct *isec = inode_security_novalidate(inode); prop->selinux.secid = isec->sid; } static int selinux_inode_copy_up(struct dentry *src, struct cred **new) { struct lsm_prop prop; struct cred_security_struct *crsec; struct cred *new_creds = *new; if (new_creds == NULL) { new_creds = prepare_creds(); if (!new_creds) return -ENOMEM; } crsec = selinux_cred(new_creds); /* Get label from overlay inode and set it in create_sid */ selinux_inode_getlsmprop(d_inode(src), &prop); crsec->create_sid = prop.selinux.secid; *new = new_creds; return 0; } static int selinux_inode_copy_up_xattr(struct dentry *dentry, const char *name) { /* The copy_up hook above sets the initial context on an inode, but we * don't then want to overwrite it by blindly copying all the lower * xattrs up. Instead, filter out SELinux-related xattrs following * policy load. */ if (selinux_initialized() && !strcmp(name, XATTR_NAME_SELINUX)) return -ECANCELED; /* Discard */ /* * Any other attribute apart from SELINUX is not claimed, supported * by selinux. 
*/ return -EOPNOTSUPP; } /* kernfs node operations */ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, NULL, 0); if (rc == -ENODATA) return 0; else if (rc < 0) return rc; clen = (u32)rc; context = kmalloc(clen, GFP_KERNEL); if (!context) return -ENOMEM; rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, context, clen); if (rc < 0) { kfree(context); return rc; } rc = security_context_to_sid(context, clen, &parent_sid, GFP_KERNEL); kfree(context); if (rc) return rc; if (crsec->create_sid) { newsid = crsec->create_sid; } else { u16 secclass = inode_mode_to_security_class(kn->mode); const char *kn_name; struct qstr q; /* kn is fresh, can't be renamed, name goes not away */ kn_name = rcu_dereference_check(kn->name, true); q.name = kn_name; q.hash_len = hashlen_string(kn_dir, kn_name); rc = security_transition_sid(crsec->sid, parent_sid, secclass, &q, &newsid); if (rc) return rc; } rc = security_sid_to_context_force(newsid, &context, &clen); if (rc) return rc; rc = kernfs_xattr_set(kn, XATTR_NAME_SELINUX, context, clen, XATTR_CREATE); kfree(context); return rc; } /* file security operations */ static int selinux_revalidate_file_permission(struct file *file, int mask) { const struct cred *cred = current_cred(); struct inode *inode = file_inode(file); /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */ if ((file->f_flags & O_APPEND) && (mask & MAY_WRITE)) mask |= MAY_APPEND; return file_has_perm(cred, file, file_mask_to_av(inode->i_mode, mask)); } static int selinux_file_permission(struct file *file, int mask) { struct inode *inode = file_inode(file); struct file_security_struct *fsec = selinux_file(file); struct inode_security_struct *isec; u32 sid = current_sid(); if (!mask) /* No permission to check. Existence test. */ return 0; isec = inode_security(inode); if (sid == fsec->sid && fsec->isid == isec->sid && fsec->pseqno == avc_policy_seqno()) /* No change since file_open check. */ return 0; return selinux_revalidate_file_permission(file, mask); } static int selinux_file_alloc_security(struct file *file) { struct file_security_struct *fsec = selinux_file(file); u32 sid = current_sid(); fsec->sid = sid; fsec->fown_sid = sid; return 0; } /* * Check whether a task has the ioctl permission and cmd * operation to an inode. 
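 * Only the low 16 bits of the ioctl command are used for the extended permission check: bits 8-15 select the driver byte and bits 0-7 the function number (xperm).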
*/ static int ioctl_has_perm(const struct cred *cred, struct file *file, u32 requested, u16 cmd) { struct common_audit_data ad; struct file_security_struct *fsec = selinux_file(file); struct inode *inode = file_inode(file); struct inode_security_struct *isec; struct lsm_ioctlop_audit ioctl; u32 ssid = cred_sid(cred); int rc; u8 driver = cmd >> 8; u8 xperm = cmd & 0xff; ad.type = LSM_AUDIT_DATA_IOCTL_OP; ad.u.op = &ioctl; ad.u.op->cmd = cmd; ad.u.op->path = file->f_path; if (ssid != fsec->sid) { rc = avc_has_perm(ssid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) goto out; } if (unlikely(IS_PRIVATE(inode))) return 0; isec = inode_security(inode); rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, requested, driver, AVC_EXT_IOCTL, xperm, &ad); out: return rc; } static int selinux_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { const struct cred *cred = current_cred(); int error = 0; switch (cmd) { case FIONREAD: case FIBMAP: case FIGETBSZ: case FS_IOC_GETFLAGS: case FS_IOC_GETVERSION: error = file_has_perm(cred, file, FILE__GETATTR); break; case FS_IOC_SETFLAGS: case FS_IOC_SETVERSION: error = file_has_perm(cred, file, FILE__SETATTR); break; /* sys_ioctl() checks */ case FIONBIO: case FIOASYNC: error = file_has_perm(cred, file, 0); break; case KDSKBENT: case KDSKBSENT: error = cred_has_capability(cred, CAP_SYS_TTY_CONFIG, CAP_OPT_NONE, true); break; case FIOCLEX: case FIONCLEX: if (!selinux_policycap_ioctl_skip_cloexec()) error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); break; /* default case assumes that the command will go * to the file's ioctl() function. */ default: error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); } return error; } static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { /* * If we are in a 64-bit kernel running 32-bit userspace, we need to * make sure we don't compare 32-bit flags to 64-bit flags. */ switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; case FS_IOC32_SETVERSION: cmd = FS_IOC_SETVERSION; break; default: break; } return selinux_file_ioctl(file, cmd, arg); } static int default_noexec __ro_after_init; static int file_map_prot_check(struct file *file, unsigned long prot, int shared) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); int rc = 0; if (default_noexec && (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) || (!shared && (prot & PROT_WRITE)))) { /* * We are making executable an anonymous mapping or a * private file mapping that will also be writable. * This has an additional check. 
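 * The additional check is the process:execmem permission below.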
*/ rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECMEM, NULL); if (rc) goto error; } if (file) { /* read access is always possible with a mapping */ u32 av = FILE__READ; /* write access only matters if the mapping is shared */ if (shared && (prot & PROT_WRITE)) av |= FILE__WRITE; if (prot & PROT_EXEC) av |= FILE__EXECUTE; return file_has_perm(cred, file, av); } error: return rc; } static int selinux_mmap_addr(unsigned long addr) { int rc = 0; if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { u32 sid = current_sid(); rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); } return rc; } static int selinux_mmap_file(struct file *file, unsigned long reqprot __always_unused, unsigned long prot, unsigned long flags) { struct common_audit_data ad; int rc; if (file) { ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; rc = inode_has_perm(current_cred(), file_inode(file), FILE__MAP, &ad); if (rc) return rc; } return file_map_prot_check(file, prot, (flags & MAP_TYPE) == MAP_SHARED); } static int selinux_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot __always_unused, unsigned long prot) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { int rc = 0; /* * We don't use the vma_is_initial_heap() helper as it has * a history of problems and is currently broken on systems * where there is no heap, e.g. brk == start_brk. Before * replacing the conditional below with vma_is_initial_heap(), * or something similar, please ensure that the logic is the * same as what we have below or you have tested every possible * corner case you can think to test. */ if (vma->vm_start >= vma->vm_mm->start_brk && vma->vm_end <= vma->vm_mm->brk) { rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); } else if (!vma->vm_file && (vma_is_initial_stack(vma) || vma_is_stack_for_current(vma))) { rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECSTACK, NULL); } else if (vma->vm_file && vma->anon_vma) { /* * We are making executable a file mapping that has * had some COW done. Since pages might have been * written, check ability to execute the possibly * modified content. This typically should only * occur for text relocations. 
*/ rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD); } if (rc) return rc; } return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED); } static int selinux_file_lock(struct file *file, unsigned int cmd) { const struct cred *cred = current_cred(); return file_has_perm(cred, file, FILE__LOCK); } static int selinux_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { const struct cred *cred = current_cred(); int err = 0; switch (cmd) { case F_SETFL: if ((file->f_flags & O_APPEND) && !(arg & O_APPEND)) { err = file_has_perm(cred, file, FILE__WRITE); break; } fallthrough; case F_SETOWN: case F_SETSIG: case F_GETFL: case F_GETOWN: case F_GETSIG: case F_GETOWNER_UIDS: /* Just check FD__USE permission */ err = file_has_perm(cred, file, 0); break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: #if BITS_PER_LONG == 32 case F_GETLK64: case F_SETLK64: case F_SETLKW64: #endif err = file_has_perm(cred, file, FILE__LOCK); break; } return err; } static void selinux_file_set_fowner(struct file *file) { struct file_security_struct *fsec; fsec = selinux_file(file); fsec->fown_sid = current_sid(); } static int selinux_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int signum) { struct file *file; u32 sid = task_sid_obj(tsk); u32 perm; struct file_security_struct *fsec; /* struct fown_struct is never outside the context of a struct file */ file = fown->file; fsec = selinux_file(file); if (!signum) perm = signal_to_av(SIGIO); /* as per send_sigio_to_task */ else perm = signal_to_av(signum); return avc_has_perm(fsec->fown_sid, sid, SECCLASS_PROCESS, perm, NULL); } static int selinux_file_receive(struct file *file) { const struct cred *cred = current_cred(); return file_has_perm(cred, file, file_to_av(file)); } static int selinux_file_open(struct file *file) { struct file_security_struct *fsec; struct inode_security_struct *isec; fsec = selinux_file(file); isec = inode_security(file_inode(file)); /* * Save inode label and policy sequence number * at open-time so that selinux_file_permission * can determine whether revalidation is necessary. * Task label is already saved in the file security * struct as its SID. */ fsec->isid = isec->sid; fsec->pseqno = avc_policy_seqno(); /* * Since the inode label or policy seqno may have changed * between the selinux_inode_permission check and the saving * of state above, recheck that access is still permitted. * Otherwise, access might never be revalidated against the * new inode label or new policy. * This check is not redundant - do not remove. 
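 * Note that the check is performed against file->f_cred, i.e. the credentials of the task that opened the file.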
*/ return file_path_has_perm(file->f_cred, file, open_file_to_av(file)); } /* task security operations */ static int selinux_task_alloc(struct task_struct *task, u64 clone_flags) { u32 sid = current_sid(); struct task_security_struct *old_tsec = selinux_task(current); struct task_security_struct *new_tsec = selinux_task(task); *new_tsec = *old_tsec; return avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL); } /* * prepare a new set of credentials for modification */ static int selinux_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { const struct cred_security_struct *old_crsec = selinux_cred(old); struct cred_security_struct *crsec = selinux_cred(new); *crsec = *old_crsec; return 0; } /* * transfer the SELinux data to a blank set of creds */ static void selinux_cred_transfer(struct cred *new, const struct cred *old) { const struct cred_security_struct *old_crsec = selinux_cred(old); struct cred_security_struct *crsec = selinux_cred(new); *crsec = *old_crsec; } static void selinux_cred_getsecid(const struct cred *c, u32 *secid) { *secid = cred_sid(c); } static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop) { prop->selinux.secid = cred_sid(c); } /* * set the security data for a kernel service * - all the creation contexts are set to unlabelled */ static int selinux_kernel_act_as(struct cred *new, u32 secid) { struct cred_security_struct *crsec = selinux_cred(new); u32 sid = current_sid(); int ret; ret = avc_has_perm(sid, secid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__USE_AS_OVERRIDE, NULL); if (ret == 0) { crsec->sid = secid; crsec->create_sid = 0; crsec->keycreate_sid = 0; crsec->sockcreate_sid = 0; } return ret; } /* * set the file creation context in a security record to the same as the * objective context of the specified inode */ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) { struct inode_security_struct *isec = inode_security(inode); struct cred_security_struct *crsec = selinux_cred(new); u32 sid = current_sid(); int ret; ret = avc_has_perm(sid, isec->sid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__CREATE_FILES_AS, NULL); if (ret == 0) crsec->create_sid = isec->sid; return ret; } static int selinux_kernel_module_request(char *kmod_name) { struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_KMOD; ad.u.kmod_name = kmod_name; return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__MODULE_REQUEST, &ad); } static int selinux_kernel_load_from_file(struct file *file, u32 requested) { struct common_audit_data ad; struct inode_security_struct *isec; struct file_security_struct *fsec; u32 sid = current_sid(); int rc; if (file == NULL) return avc_has_perm(sid, sid, SECCLASS_SYSTEM, requested, NULL); ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; fsec = selinux_file(file); if (sid != fsec->sid) { rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } isec = inode_security(file_inode(file)); return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM, requested, &ad); } static int selinux_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { int rc = 0; BUILD_BUG_ON_MSG(READING_MAX_ID > 8, "New kernel_read_file_id introduced; update SELinux!"); switch (id) { case READING_FIRMWARE: rc = selinux_kernel_load_from_file(file, SYSTEM__FIRMWARE_LOAD); break; case READING_MODULE: case READING_MODULE_COMPRESSED: rc = selinux_kernel_load_from_file(file, SYSTEM__MODULE_LOAD); break; case READING_KEXEC_IMAGE: rc = 
selinux_kernel_load_from_file(file, SYSTEM__KEXEC_IMAGE_LOAD); break; case READING_KEXEC_INITRAMFS: rc = selinux_kernel_load_from_file(file, SYSTEM__KEXEC_INITRAMFS_LOAD); break; case READING_POLICY: rc = selinux_kernel_load_from_file(file, SYSTEM__POLICY_LOAD); break; case READING_X509_CERTIFICATE: rc = selinux_kernel_load_from_file(file, SYSTEM__X509_CERTIFICATE_LOAD); break; default: break; } return rc; } static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents) { int rc = 0; BUILD_BUG_ON_MSG(LOADING_MAX_ID > 8, "New kernel_load_data_id introduced; update SELinux!"); switch (id) { case LOADING_FIRMWARE: rc = selinux_kernel_load_from_file(NULL, SYSTEM__FIRMWARE_LOAD); break; case LOADING_MODULE: rc = selinux_kernel_load_from_file(NULL, SYSTEM__MODULE_LOAD); break; case LOADING_KEXEC_IMAGE: rc = selinux_kernel_load_from_file(NULL, SYSTEM__KEXEC_IMAGE_LOAD); break; case LOADING_KEXEC_INITRAMFS: rc = selinux_kernel_load_from_file(NULL, SYSTEM__KEXEC_INITRAMFS_LOAD); break; case LOADING_POLICY: rc = selinux_kernel_load_from_file(NULL, SYSTEM__POLICY_LOAD); break; case LOADING_X509_CERTIFICATE: rc = selinux_kernel_load_from_file(NULL, SYSTEM__X509_CERTIFICATE_LOAD); break; default: break; } return rc; } static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETPGID, NULL); } static int selinux_task_getpgid(struct task_struct *p) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETPGID, NULL); } static int selinux_task_getsid(struct task_struct *p) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSESSION, NULL); } static void selinux_current_getlsmprop_subj(struct lsm_prop *prop) { prop->selinux.secid = current_sid(); } static void selinux_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop) { prop->selinux.secid = task_sid_obj(p); } static int selinux_task_setnice(struct task_struct *p, int nice) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_setioprio(struct task_struct *p, int ioprio) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getioprio(struct task_struct *p) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { u32 av = 0; if (!flags) return 0; if (flags & LSM_PRLIMIT_WRITE) av |= PROCESS__SETRLIMIT; if (flags & LSM_PRLIMIT_READ) av |= PROCESS__GETRLIMIT; return avc_has_perm(cred_sid(cred), cred_sid(tcred), SECCLASS_PROCESS, av, NULL); } static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { struct rlimit *old_rlim = p->signal->rlim + resource; /* Control the ability to change the hard limit (whether lowering or raising it), so that the hard limit can later be used as a safe reset point for the soft limit upon context transitions. See selinux_bprm_committing_creds. 
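 * Changing only the soft limit within the existing hard limit is not controlled here.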
*/ if (old_rlim->rlim_max != new_rlim->rlim_max) return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL); return 0; } static int selinux_task_setscheduler(struct task_struct *p) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getscheduler(struct task_struct *p) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_movememory(struct task_struct *p) { return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { u32 secid; u32 perm; if (!sig) perm = PROCESS__SIGNULL; /* null signal; existence test */ else perm = signal_to_av(sig); if (!cred) secid = current_sid(); else secid = cred_sid(cred); return avc_has_perm(secid, task_sid_obj(p), SECCLASS_PROCESS, perm, NULL); } static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); u32 sid = task_sid_obj(p); spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = sid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); } static int selinux_userns_create(const struct cred *cred) { u32 sid = current_sid(); return avc_has_perm(sid, sid, SECCLASS_USER_NAMESPACE, USER_NAMESPACE__CREATE, NULL); } /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv4(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ihlen, ret = -EINVAL; struct iphdr _iph, *ih; offset = skb_network_offset(skb); ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) goto out; ihlen = ih->ihl * 4; if (ihlen < sizeof(_iph)) goto out; ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; ret = 0; if (proto) *proto = ih->protocol; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } #endif default: break; } out: return ret; } #if IS_ENABLED(CONFIG_IPV6) /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv6(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { u8 nexthdr; int ret = -EINVAL, offset; struct ipv6hdr _ipv6h, *ip6; __be16 frag_off; offset = skb_network_offset(skb); ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (ip6 == NULL) goto out; ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; ret = 0; nexthdr = ip6->nexthdr; offset += sizeof(_ipv6h); offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) goto out; if (proto) *proto = nexthdr; 
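/* ipv6_skip_exthdr() has advanced offset past any extension headers, so nexthdr now identifies the upper-layer protocol; where that protocol carries port numbers, record them in the audit data. */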
switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } #endif /* includes fragments */ default: break; } out: return ret; } #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad, char **_addrp, int src, u8 *proto) { char *addrp; int ret; switch (ad->u.net->family) { case PF_INET: ret = selinux_parse_skb_ipv4(skb, ad, proto); if (ret) goto parse_error; addrp = (char *)(src ? &ad->u.net->v4info.saddr : &ad->u.net->v4info.daddr); goto okay; #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: ret = selinux_parse_skb_ipv6(skb, ad, proto); if (ret) goto parse_error; addrp = (char *)(src ? &ad->u.net->v6info.saddr : &ad->u.net->v6info.daddr); goto okay; #endif /* IPV6 */ default: addrp = NULL; goto okay; } parse_error: pr_warn( "SELinux: failure in selinux_parse_skb()," " unable to parse packet\n"); return ret; okay: if (_addrp) *_addrp = addrp; return 0; } /** * selinux_skb_peerlbl_sid - Determine the peer label of a packet * @skb: the packet * @family: protocol family * @sid: the packet's peer label SID * * Description: * Check the various different forms of network peer labeling and determine * the peer label/SID for the packet; most of the magic actually occurs in * the security server function security_net_peersid_cmp(). The function * returns zero if the value in @sid is valid (although it may be SECSID_NULL) * or -EACCES if @sid is invalid due to inconsistencies with the different * peer labels. * */ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) { int err; u32 xfrm_sid; u32 nlbl_sid; u32 nlbl_type; err = selinux_xfrm_skb_sid(skb, &xfrm_sid); if (unlikely(err)) return -EACCES; err = selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); if (unlikely(err)) return -EACCES; err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid); if (unlikely(err)) { pr_warn( "SELinux: failure in selinux_skb_peerlbl_sid()," " unable to determine packet's peer label\n"); return -EACCES; } return 0; } /** * selinux_conn_sid - Determine the child socket label for a connection * @sk_sid: the parent socket's SID * @skb_sid: the packet's SID * @conn_sid: the resulting connection SID * * If @skb_sid is valid then the user:role:type information from @sk_sid is * combined with the MLS information from @skb_sid in order to create * @conn_sid. If @skb_sid is not valid then @conn_sid is simply a copy * of @sk_sid. Returns zero on success, negative values on failure. 
* */ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid) { int err = 0; if (skb_sid != SECSID_NULL) err = security_sid_mls_copy(sk_sid, skb_sid, conn_sid); else *conn_sid = sk_sid; return err; } /* socket security operations */ static int socket_sockcreate_sid(const struct cred_security_struct *crsec, u16 secclass, u32 *socksid) { if (crsec->sockcreate_sid > SECSID_NULL) { *socksid = crsec->sockcreate_sid; return 0; } return security_transition_sid(crsec->sid, crsec->sid, secclass, NULL, socksid); } static bool sock_skip_has_perm(u32 sid) { if (sid == SECINITSID_KERNEL) return true; /* * Before POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, sockets that * inherited the kernel context from early boot used to be skipped * here, so preserve that behavior unless the capability is set. * * By setting the capability the policy signals that it is ready * for this quirk to be fixed. Note that sockets created by a kernel * thread or a usermode helper executed without a transition will * still be skipped in this check regardless of the policycap * setting. */ if (!selinux_policycap_userspace_initial_context() && sid == SECINITSID_INIT) return true; return false; } static int sock_has_perm(struct sock *sk, u32 perms) { struct sk_security_struct *sksec = sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net; if (sock_skip_has_perm(sksec->sid)) return 0; ad_net_init_from_sk(&ad, &net, sk); return avc_has_perm(current_sid(), sksec->sid, sksec->sclass, perms, &ad); } static int selinux_socket_create(int family, int type, int protocol, int kern) { const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 newsid; u16 secclass; int rc; if (kern) return 0; secclass = socket_type_to_security_class(family, type, protocol); rc = socket_sockcreate_sid(crsec, secclass, &newsid); if (rc) return rc; return avc_has_perm(crsec->sid, newsid, secclass, SOCKET__CREATE, NULL); } static int selinux_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock)); struct sk_security_struct *sksec; u16 sclass = socket_type_to_security_class(family, type, protocol); u32 sid = SECINITSID_KERNEL; int err = 0; if (!kern) { err = socket_sockcreate_sid(crsec, sclass, &sid); if (err) return err; } isec->sclass = sclass; isec->sid = sid; isec->initialized = LABEL_INITIALIZED; if (sock->sk) { sksec = selinux_sock(sock->sk); sksec->sclass = sclass; sksec->sid = sid; /* Allows detection of the first association on this socket */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) sksec->sctp_assoc_state = SCTP_ASSOC_UNSET; err = selinux_netlbl_socket_post_create(sock->sk, family); } return err; } static int selinux_socket_socketpair(struct socket *socka, struct socket *sockb) { struct sk_security_struct *sksec_a = selinux_sock(socka->sk); struct sk_security_struct *sksec_b = selinux_sock(sockb->sk); sksec_a->peer_sid = sksec_b->sid; sksec_b->peer_sid = sksec_a->sid; return 0; } /* Range of port numbers used to automatically bind. Need to determine whether we should perform a name_bind permission check between the socket and the port number. 
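An explicitly requested port gets a name_bind check when it lies outside the local ephemeral range reported by inet_get_local_port_range() or requires the bind service capability; a port of 0 (automatic assignment) is not checked here.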
*/ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { struct sock *sk = sock->sk; struct sk_security_struct *sksec = selinux_sock(sk); u16 family; int err; err = sock_has_perm(sk, SOCKET__BIND); if (err) goto out; /* If PF_INET or PF_INET6, check name_bind permission for the port. */ family = sk->sk_family; if (family == PF_INET || family == PF_INET6) { char *addrp; struct common_audit_data ad; struct lsm_network_audit net = {0,}; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; u16 family_sa; unsigned short snum; u32 sid, node_perm; /* * sctp_bindx(3) calls via selinux_sctp_bind_connect() * that validates multiple binding addresses. Because of this * need to check address->sa_family as it is possible to have * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET. */ if (addrlen < offsetofend(struct sockaddr, sa_family)) return -EINVAL; family_sa = address->sa_family; switch (family_sa) { case AF_UNSPEC: case AF_INET: if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; addr4 = (struct sockaddr_in *)address; if (family_sa == AF_UNSPEC) { if (family == PF_INET6) { /* Length check from inet6_bind_sk() */ if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; /* Family check from __inet6_bind() */ goto err_af; } /* see __inet_bind(), we only want to allow * AF_UNSPEC if the address is INADDR_ANY */ if (addr4->sin_addr.s_addr != htonl(INADDR_ANY)) goto err_af; family_sa = AF_INET; } snum = ntohs(addr4->sin_port); addrp = (char *)&addr4->sin_addr.s_addr; break; case AF_INET6: if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; addr6 = (struct sockaddr_in6 *)address; snum = ntohs(addr6->sin6_port); addrp = (char *)&addr6->sin6_addr.s6_addr; break; default: goto err_af; } ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sport = htons(snum); ad.u.net->family = family_sa; if (snum) { int low, high; inet_get_local_port_range(sock_net(sk), &low, &high); if (inet_port_requires_bind_service(sock_net(sk), snum) || snum < low || snum > high) { err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) goto out; err = avc_has_perm(sksec->sid, sid, sksec->sclass, SOCKET__NAME_BIND, &ad); if (err) goto out; } } switch (sksec->sclass) { case SECCLASS_TCP_SOCKET: node_perm = TCP_SOCKET__NODE_BIND; break; case SECCLASS_UDP_SOCKET: node_perm = UDP_SOCKET__NODE_BIND; break; case SECCLASS_SCTP_SOCKET: node_perm = SCTP_SOCKET__NODE_BIND; break; default: node_perm = RAWIP_SOCKET__NODE_BIND; break; } err = sel_netnode_sid(addrp, family_sa, &sid); if (err) goto out; if (family_sa == AF_INET) ad.u.net->v4info.saddr = addr4->sin_addr.s_addr; else ad.u.net->v6info.saddr = addr6->sin6_addr; err = avc_has_perm(sksec->sid, sid, sksec->sclass, node_perm, &ad); if (err) goto out; } out: return err; err_af: /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. 
*/ if (sk->sk_protocol == IPPROTO_SCTP) return -EINVAL; return -EAFNOSUPPORT; } /* This supports connect(2) and SCTP connect services such as sctp_connectx(3) * and sctp_sendmsg(3) as described in Documentation/security/SCTP.rst */ static int selinux_socket_connect_helper(struct socket *sock, struct sockaddr *address, int addrlen) { struct sock *sk = sock->sk; struct sk_security_struct *sksec = selinux_sock(sk); int err; err = sock_has_perm(sk, SOCKET__CONNECT); if (err) return err; if (addrlen < offsetofend(struct sockaddr, sa_family)) return -EINVAL; /* connect(AF_UNSPEC) has special handling, as it is a documented * way to disconnect the socket */ if (address->sa_family == AF_UNSPEC) return 0; /* * If a TCP or SCTP socket, check name_connect permission * for the port. */ if (sksec->sclass == SECCLASS_TCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) { struct common_audit_data ad; struct lsm_network_audit net = {0,}; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; unsigned short snum; u32 sid, perm; /* sctp_connectx(3) calls via selinux_sctp_bind_connect() * that validates multiple connect addresses. Because of this * need to check address->sa_family as it is possible to have * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET. */ switch (address->sa_family) { case AF_INET: addr4 = (struct sockaddr_in *)address; if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; snum = ntohs(addr4->sin_port); break; case AF_INET6: addr6 = (struct sockaddr_in6 *)address; if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; snum = ntohs(addr6->sin6_port); break; default: /* Note that SCTP services expect -EINVAL, whereas * others expect -EAFNOSUPPORT. */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) return -EINVAL; else return -EAFNOSUPPORT; } err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) return err; switch (sksec->sclass) { case SECCLASS_TCP_SOCKET: perm = TCP_SOCKET__NAME_CONNECT; break; case SECCLASS_SCTP_SOCKET: perm = SCTP_SOCKET__NAME_CONNECT; break; } ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->dport = htons(snum); ad.u.net->family = address->sa_family; err = avc_has_perm(sksec->sid, sid, sksec->sclass, perm, &ad); if (err) return err; } return 0; } /* Supports connect(2), see comments in selinux_socket_connect_helper() */ static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { int err; struct sock *sk = sock->sk; err = selinux_socket_connect_helper(sock, address, addrlen); if (err) return err; return selinux_netlbl_socket_connect(sk, address); } static int selinux_socket_listen(struct socket *sock, int backlog) { return sock_has_perm(sock->sk, SOCKET__LISTEN); } static int selinux_socket_accept(struct socket *sock, struct socket *newsock) { int err; struct inode_security_struct *isec; struct inode_security_struct *newisec; u16 sclass; u32 sid; err = sock_has_perm(sock->sk, SOCKET__ACCEPT); if (err) return err; isec = inode_security_novalidate(SOCK_INODE(sock)); spin_lock(&isec->lock); sclass = isec->sclass; sid = isec->sid; spin_unlock(&isec->lock); newisec = inode_security_novalidate(SOCK_INODE(newsock)); newisec->sclass = sclass; newisec->sid = sid; newisec->initialized = LABEL_INITIALIZED; return 0; } static int selinux_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return sock_has_perm(sock->sk, SOCKET__WRITE); } static int selinux_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return sock_has_perm(sock->sk, SOCKET__READ); } static int 
selinux_socket_getsockname(struct socket *sock) { return sock_has_perm(sock->sk, SOCKET__GETATTR); } static int selinux_socket_getpeername(struct socket *sock) { return sock_has_perm(sock->sk, SOCKET__GETATTR); } static int selinux_socket_setsockopt(struct socket *sock, int level, int optname) { int err; err = sock_has_perm(sock->sk, SOCKET__SETOPT); if (err) return err; return selinux_netlbl_socket_setsockopt(sock, level, optname); } static int selinux_socket_getsockopt(struct socket *sock, int level, int optname) { return sock_has_perm(sock->sk, SOCKET__GETOPT); } static int selinux_socket_shutdown(struct socket *sock, int how) { return sock_has_perm(sock->sk, SOCKET__SHUTDOWN); } static int selinux_socket_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { struct sk_security_struct *sksec_sock = selinux_sock(sock); struct sk_security_struct *sksec_other = selinux_sock(other); struct sk_security_struct *sksec_new = selinux_sock(newsk); struct common_audit_data ad; struct lsm_network_audit net; int err; ad_net_init_from_sk(&ad, &net, other); err = avc_has_perm(sksec_sock->sid, sksec_other->sid, sksec_other->sclass, UNIX_STREAM_SOCKET__CONNECTTO, &ad); if (err) return err; /* server child socket */ sksec_new->peer_sid = sksec_sock->sid; err = security_sid_mls_copy(sksec_other->sid, sksec_sock->sid, &sksec_new->sid); if (err) return err; /* connecting socket */ sksec_sock->peer_sid = sksec_new->sid; return 0; } static int selinux_socket_unix_may_send(struct socket *sock, struct socket *other) { struct sk_security_struct *ssec = selinux_sock(sock->sk); struct sk_security_struct *osec = selinux_sock(other->sk); struct common_audit_data ad; struct lsm_network_audit net; ad_net_init_from_sk(&ad, &net, other->sk); return avc_has_perm(ssec->sid, osec->sid, osec->sclass, SOCKET__SENDTO, &ad); } static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex, char *addrp, u16 family, u32 peer_sid, struct common_audit_data *ad) { int err; u32 if_sid; u32 node_sid; err = sel_netif_sid(ns, ifindex, &if_sid); if (err) return err; err = avc_has_perm(peer_sid, if_sid, SECCLASS_NETIF, NETIF__INGRESS, ad); if (err) return err; err = sel_netnode_sid(addrp, family, &node_sid); if (err) return err; return avc_has_perm(peer_sid, node_sid, SECCLASS_NODE, NODE__RECVFROM, ad); } static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, u16 family) { int err = 0; struct sk_security_struct *sksec = selinux_sock(sk); u32 sk_sid = sksec->sid; struct common_audit_data ad; struct lsm_network_audit net; char *addrp; ad_net_init_from_iif(&ad, &net, skb->skb_iif, family); err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); if (err) return err; if (selinux_secmark_enabled()) { err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) return err; } err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad); if (err) return err; err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); return err; } static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err, peerlbl_active, secmark_active; struct sk_security_struct *sksec = selinux_sock(sk); u16 family = sk->sk_family; u32 sk_sid = sksec->sid; struct common_audit_data ad; struct lsm_network_audit net; char *addrp; if (family != PF_INET && family != PF_INET6) return 0; /* Handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; /* If any sort of compatibility mode is enabled then handoff processing * 
to the selinux_sock_rcv_skb_compat() function to deal with the * special handling. We do this in an attempt to keep this function * as fast and as clean as possible. */ if (!selinux_policycap_netpeer()) return selinux_sock_rcv_skb_compat(sk, skb, family); secmark_active = selinux_secmark_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return 0; ad_net_init_from_iif(&ad, &net, skb->skb_iif, family); err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); if (err) return err; if (peerlbl_active) { u32 peer_sid; err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); if (err) return err; err = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif, addrp, family, peer_sid, &ad); if (err) { selinux_netlbl_err(skb, family, err, 0); return err; } err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, PEER__RECV, &ad); if (err) { selinux_netlbl_err(skb, family, err, 0); return err; } } if (secmark_active) { err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) return err; } return err; } static int selinux_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { int err = 0; char *scontext = NULL; u32 scontext_len; struct sk_security_struct *sksec = selinux_sock(sock->sk); u32 peer_sid = SECSID_NULL; if (sksec->sclass == SECCLASS_UNIX_STREAM_SOCKET || sksec->sclass == SECCLASS_TCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) peer_sid = sksec->peer_sid; if (peer_sid == SECSID_NULL) return -ENOPROTOOPT; err = security_sid_to_context(peer_sid, &scontext, &scontext_len); if (err) return err; if (scontext_len > len) { err = -ERANGE; goto out_len; } if (copy_to_sockptr(optval, scontext, scontext_len)) err = -EFAULT; out_len: if (copy_to_sockptr(optlen, &scontext_len, sizeof(scontext_len))) err = -EFAULT; kfree(scontext); return err; } static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { u32 peer_secid = SECSID_NULL; u16 family; if (skb && skb->protocol == htons(ETH_P_IP)) family = PF_INET; else if (skb && skb->protocol == htons(ETH_P_IPV6)) family = PF_INET6; else if (sock) family = sock->sk->sk_family; else { *secid = SECSID_NULL; return -EINVAL; } if (sock && family == PF_UNIX) { struct inode_security_struct *isec; isec = inode_security_novalidate(SOCK_INODE(sock)); peer_secid = isec->sid; } else if (skb) selinux_skb_peerlbl_sid(skb, family, &peer_secid); *secid = peer_secid; if (peer_secid == SECSID_NULL) return -ENOPROTOOPT; return 0; } static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) { struct sk_security_struct *sksec = selinux_sock(sk); sksec->peer_sid = SECINITSID_UNLABELED; sksec->sid = SECINITSID_UNLABELED; sksec->sclass = SECCLASS_SOCKET; selinux_netlbl_sk_security_reset(sksec); return 0; } static void selinux_sk_free_security(struct sock *sk) { struct sk_security_struct *sksec = selinux_sock(sk); selinux_netlbl_sk_security_free(sksec); } static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = selinux_sock(sk); struct sk_security_struct *newsksec = selinux_sock(newsk); newsksec->sid = sksec->sid; newsksec->peer_sid = sksec->peer_sid; newsksec->sclass = sksec->sclass; selinux_netlbl_sk_security_reset(newsksec); } static void selinux_sk_getsecid(const struct sock *sk, u32 *secid) { if (!sk) *secid = SECINITSID_ANY_SOCKET; else { const struct sk_security_struct *sksec = selinux_sock(sk); *secid = sksec->sid; } } static void 
selinux_sock_graft(struct sock *sk, struct socket *parent) { struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(parent)); struct sk_security_struct *sksec = selinux_sock(sk); if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || sk->sk_family == PF_UNIX) isec->sid = sksec->sid; sksec->sclass = isec->sclass; } /* * Determines peer_secid for the asoc and updates socket's peer label * if it's the first association on the socket. */ static int selinux_sctp_process_new_assoc(struct sctp_association *asoc, struct sk_buff *skb) { struct sock *sk = asoc->base.sk; u16 family = sk->sk_family; struct sk_security_struct *sksec = selinux_sock(sk); struct common_audit_data ad; struct lsm_network_audit net; int err; /* handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; if (selinux_peerlbl_enabled()) { asoc->peer_secid = SECSID_NULL; /* This will return peer_sid = SECSID_NULL if there are * no peer labels, see security_net_peersid_resolve(). */ err = selinux_skb_peerlbl_sid(skb, family, &asoc->peer_secid); if (err) return err; if (asoc->peer_secid == SECSID_NULL) asoc->peer_secid = SECINITSID_UNLABELED; } else { asoc->peer_secid = SECINITSID_UNLABELED; } if (sksec->sctp_assoc_state == SCTP_ASSOC_UNSET) { sksec->sctp_assoc_state = SCTP_ASSOC_SET; /* Here as the first association on the socket. As the peer SID * was allowed by peer recv (and the netif/node checks), * it is approved by policy and used as the primary * peer SID for getpeercon(3). */ sksec->peer_sid = asoc->peer_secid; } else if (sksec->peer_sid != asoc->peer_secid) { /* Other association peer SIDs are checked to enforce * consistency among the peer SIDs. */ ad_net_init_from_sk(&ad, &net, asoc->base.sk); err = avc_has_perm(sksec->peer_sid, asoc->peer_secid, sksec->sclass, SCTP_SOCKET__ASSOCIATION, &ad); if (err) return err; } return 0; } /* Called whenever SCTP receives an INIT or COOKIE ECHO chunk. This * happens on an incoming connect(2), sctp_connectx(3) or * sctp_sendmsg(3) (with no association already present). */ static int selinux_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { struct sk_security_struct *sksec = selinux_sock(asoc->base.sk); u32 conn_sid; int err; if (!selinux_policycap_extsockclass()) return 0; err = selinux_sctp_process_new_assoc(asoc, skb); if (err) return err; /* Compute the MLS component for the connection and store * the information in asoc. This will be used by SCTP TCP type * sockets and peeled off connections as they cause a new * socket to be generated. selinux_sctp_sk_clone() will then * plug this into the new socket. */ err = selinux_conn_sid(sksec->sid, asoc->peer_secid, &conn_sid); if (err) return err; asoc->secid = conn_sid; /* Set any NetLabel labels including CIPSO/CALIPSO options. */ return selinux_netlbl_sctp_assoc_request(asoc, skb); } /* Called when SCTP receives a COOKIE ACK chunk as the final * response to an association request (initiated by us). */ static int selinux_sctp_assoc_established(struct sctp_association *asoc, struct sk_buff *skb) { struct sk_security_struct *sksec = selinux_sock(asoc->base.sk); if (!selinux_policycap_extsockclass()) return 0; /* Inherit secid from the parent socket - this will be picked up * by selinux_sctp_sk_clone() if the association gets peeled off * into a new socket. 
*/ asoc->secid = sksec->sid; return selinux_sctp_process_new_assoc(asoc, skb); } /* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting * based on their @optname. */ static int selinux_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { int len, err = 0, walk_size = 0; void *addr_buf; struct sockaddr *addr; struct socket *sock; if (!selinux_policycap_extsockclass()) return 0; /* Process one or more addresses that may be IPv4 or IPv6 */ sock = sk->sk_socket; addr_buf = address; while (walk_size < addrlen) { if (walk_size + sizeof(sa_family_t) > addrlen) return -EINVAL; addr = addr_buf; switch (addr->sa_family) { case AF_UNSPEC: case AF_INET: len = sizeof(struct sockaddr_in); break; case AF_INET6: len = sizeof(struct sockaddr_in6); break; default: return -EINVAL; } if (walk_size + len > addrlen) return -EINVAL; err = -EINVAL; switch (optname) { /* Bind checks */ case SCTP_PRIMARY_ADDR: case SCTP_SET_PEER_PRIMARY_ADDR: case SCTP_SOCKOPT_BINDX_ADD: err = selinux_socket_bind(sock, addr, len); break; /* Connect checks */ case SCTP_SOCKOPT_CONNECTX: case SCTP_PARAM_SET_PRIMARY: case SCTP_PARAM_ADD_IP: case SCTP_SENDMSG_CONNECT: err = selinux_socket_connect_helper(sock, addr, len); if (err) return err; /* As selinux_sctp_bind_connect() is called by the * SCTP protocol layer, the socket is already locked, * therefore selinux_netlbl_socket_connect_locked() * is called here. The situations handled are: * sctp_connectx(3), sctp_sendmsg(3), sendmsg(2), * whenever a new IP address is added or when a new * primary address is selected. * Note that an SCTP connect(2) call happens before * the SCTP protocol layer and is handled via * selinux_socket_connect(). */ err = selinux_netlbl_socket_connect_locked(sk, addr); break; } if (err) return err; addr_buf += len; walk_size += len; } return 0; } /* Called whenever a new socket is created by accept(2) or sctp_peeloff(3). */ static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = selinux_sock(sk); struct sk_security_struct *newsksec = selinux_sock(newsk); /* If policy does not support SECCLASS_SCTP_SOCKET then call * the non-sctp clone version. 
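* (i.e. the extended_socket_class policy capability is not enabled)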
*/ if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); newsksec->sid = asoc->secid; newsksec->peer_sid = asoc->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); } static int selinux_mptcp_add_subflow(struct sock *sk, struct sock *ssk) { struct sk_security_struct *ssksec = selinux_sock(ssk); struct sk_security_struct *sksec = selinux_sock(sk); ssksec->sclass = sksec->sclass; ssksec->sid = sksec->sid; /* replace the existing subflow label deleting the existing one * and re-recreating a new label using the updated context */ selinux_netlbl_sk_security_free(ssksec); return selinux_netlbl_socket_post_create(ssk, ssk->sk_family); } static int selinux_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct sk_security_struct *sksec = selinux_sock(sk); int err; u16 family = req->rsk_ops->family; u32 connsid; u32 peersid; err = selinux_skb_peerlbl_sid(skb, family, &peersid); if (err) return err; err = selinux_conn_sid(sksec->sid, peersid, &connsid); if (err) return err; req->secid = connsid; req->peer_secid = peersid; return selinux_netlbl_inet_conn_request(req, family); } static void selinux_inet_csk_clone(struct sock *newsk, const struct request_sock *req) { struct sk_security_struct *newsksec = selinux_sock(newsk); newsksec->sid = req->secid; newsksec->peer_sid = req->peer_secid; /* NOTE: Ideally, we should also get the isec->sid for the new socket in sync, but we don't have the isec available yet. So we will wait until sock_graft to do it, by which time it will have been created and available. */ /* We don't need to take any sort of lock here as we are the only * thread with access to newsksec */ selinux_netlbl_inet_csk_clone(newsk, req->rsk_ops->family); } static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) { u16 family = sk->sk_family; struct sk_security_struct *sksec = selinux_sock(sk); /* handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); } static int selinux_secmark_relabel_packet(u32 sid) { return avc_has_perm(current_sid(), sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); } static void selinux_secmark_refcount_inc(void) { atomic_inc(&selinux_secmark_refcount); } static void selinux_secmark_refcount_dec(void) { atomic_dec(&selinux_secmark_refcount); } static void selinux_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { flic->flowic_secid = req->secid; } static int selinux_tun_dev_alloc_security(void *security) { struct tun_security_struct *tunsec = selinux_tun_dev(security); tunsec->sid = current_sid(); return 0; } static int selinux_tun_dev_create(void) { u32 sid = current_sid(); /* we aren't taking into account the "sockcreate" SID since the socket * that is being created here is not a socket in the traditional sense, * instead it is a private sock, accessible only to the kernel, and * representing a wide range of network traffic spanning multiple * connections unlike traditional sockets - check the TUN driver to * get a better understanding of why this socket is special */ return avc_has_perm(sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__CREATE, NULL); } static int selinux_tun_dev_attach_queue(void *security) { struct tun_security_struct *tunsec = selinux_tun_dev(security); return avc_has_perm(current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__ATTACH_QUEUE, NULL); } static int 
selinux_tun_dev_attach(struct sock *sk, void *security) { struct tun_security_struct *tunsec = selinux_tun_dev(security); struct sk_security_struct *sksec = selinux_sock(sk); /* we don't currently perform any NetLabel based labeling here and it * isn't clear that we would want to do so anyway; while we could apply * labeling without the support of the TUN user the resulting labeled * traffic from the other end of the connection would almost certainly * cause confusion to the TUN user that had no idea network labeling * protocols were being used */ sksec->sid = tunsec->sid; sksec->sclass = SECCLASS_TUN_SOCKET; return 0; } static int selinux_tun_dev_open(void *security) { struct tun_security_struct *tunsec = selinux_tun_dev(security); u32 sid = current_sid(); int err; err = avc_has_perm(sid, tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELFROM, NULL); if (err) return err; err = avc_has_perm(sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELTO, NULL); if (err) return err; tunsec->sid = sid; return 0; } #ifdef CONFIG_NETFILTER static unsigned int selinux_ip_forward(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int ifindex; u16 family; char *addrp; u32 peer_sid; struct common_audit_data ad; struct lsm_network_audit net; int secmark_active, peerlbl_active; if (!selinux_policycap_netpeer()) return NF_ACCEPT; secmark_active = selinux_secmark_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; family = state->pf; if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) return NF_DROP; ifindex = state->in->ifindex; ad_net_init_from_iif(&ad, &net, ifindex, family); if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) return NF_DROP; if (peerlbl_active) { int err; err = selinux_inet_sys_rcv_skb(state->net, ifindex, addrp, family, peer_sid, &ad); if (err) { selinux_netlbl_err(skb, family, err, 1); return NF_DROP; } } if (secmark_active) if (avc_has_perm(peer_sid, skb->secmark, SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) return NF_DROP; if (netlbl_enabled()) /* we do this in the FORWARD path and not the POST_ROUTING * path because we want to make sure we apply the necessary * labeling before IPsec is applied so we can leverage AH * protection */ if (selinux_netlbl_skbuff_setsid(skb, family, peer_sid) != 0) return NF_DROP; return NF_ACCEPT; } static unsigned int selinux_ip_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk; u32 sid; if (!netlbl_enabled()) return NF_ACCEPT; /* we do this in the LOCAL_OUT path and not the POST_ROUTING path * because we want to make sure we apply the necessary labeling * before IPsec is applied so we can leverage AH protection */ sk = skb_to_full_sk(skb); if (sk) { struct sk_security_struct *sksec; if (sk_listener(sk)) /* if the socket is the listening state then this * packet is a SYN-ACK packet which means it needs to * be labeled based on the connection/request_sock and * not the parent socket. unfortunately, we can't * lookup the request_sock yet as it isn't queued on * the parent socket until after the SYN-ACK is sent. * the "solution" is to simply pass the packet as-is * as any IP option based labeling should be copied * from the initial connection request (in the IP * layer). it is far from ideal, but until we get a * security label in the packet itself this is the * best we can do. 
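* selinux_ip_postroute() handles the same listening-socket case by * regenerating the connection label before running its checks; see the * comment there.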
*/ return NF_ACCEPT; /* standard practice, label using the parent socket */ sksec = selinux_sock(sk); sid = sksec->sid; } else sid = SECINITSID_KERNEL; if (selinux_netlbl_skbuff_setsid(skb, state->pf, sid) != 0) return NF_DROP; return NF_ACCEPT; } static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk; struct sk_security_struct *sksec; struct common_audit_data ad; struct lsm_network_audit net; u8 proto = 0; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; sksec = selinux_sock(sk); ad_net_init_from_iif(&ad, &net, state->out->ifindex, state->pf); if (selinux_parse_skb(skb, &ad, NULL, 0, &proto)) return NF_DROP; if (selinux_secmark_enabled()) if (avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, PACKET__SEND, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto)) return NF_DROP_ERR(-ECONNREFUSED); return NF_ACCEPT; } static unsigned int selinux_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { u16 family; u32 secmark_perm; u32 peer_sid; int ifindex; struct sock *sk; struct common_audit_data ad; struct lsm_network_audit net; char *addrp; int secmark_active, peerlbl_active; /* If any sort of compatibility mode is enabled then handoff processing * to the selinux_ip_postroute_compat() function to deal with the * special handling. We do this in an attempt to keep this function * as fast and as clean as possible. */ if (!selinux_policycap_netpeer()) return selinux_ip_postroute_compat(skb, state); secmark_active = selinux_secmark_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; sk = skb_to_full_sk(skb); #ifdef CONFIG_XFRM /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec * packet transformation so allow the packet to pass without any checks * since we'll have another chance to perform access control checks * when the packet is on it's final way out. * NOTE: there appear to be some IPv6 multicast cases where skb->dst * is NULL, in this case go ahead and apply access control. * NOTE: if this is a local socket (skb->sk != NULL) that is in the * TCP listening state we cannot wait until the XFRM processing * is done as we will miss out on the SA label if we do; * unfortunately, this means more work, but it is only once per * connection. */ if (skb_dst(skb) != NULL && skb_dst(skb)->xfrm != NULL && !(sk && sk_listener(sk))) return NF_ACCEPT; #endif family = state->pf; if (sk == NULL) { /* Without an associated socket the packet is either coming * from the kernel or it is being forwarded; check the packet * to determine which and if the packet is being forwarded * query the packet directly to determine the security label. */ if (skb->skb_iif) { secmark_perm = PACKET__FORWARD_OUT; if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) return NF_DROP; } else { secmark_perm = PACKET__SEND; peer_sid = SECINITSID_KERNEL; } } else if (sk_listener(sk)) { /* Locally generated packet but the associated socket is in the * listening state which means this is a SYN-ACK packet. In * this particular case the correct security label is assigned * to the connection/request_sock but unfortunately we can't * query the request_sock as it isn't queued on the parent * socket until after the SYN-ACK packet is sent; the only * viable choice is to regenerate the label like we do in * selinux_inet_conn_request(). See also selinux_ip_output() * for similar problems. 
*/ u32 skb_sid; struct sk_security_struct *sksec; sksec = selinux_sock(sk); if (selinux_skb_peerlbl_sid(skb, family, &skb_sid)) return NF_DROP; /* At this point, if the returned skb peerlbl is SECSID_NULL * and the packet has been through at least one XFRM * transformation then we must be dealing with the "final" * form of labeled IPsec packet; since we've already applied * all of our access controls on this packet we can safely * pass the packet. */ if (skb_sid == SECSID_NULL) { switch (family) { case PF_INET: if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return NF_ACCEPT; break; case PF_INET6: if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return NF_ACCEPT; break; default: return NF_DROP_ERR(-ECONNREFUSED); } } if (selinux_conn_sid(sksec->sid, skb_sid, &peer_sid)) return NF_DROP; secmark_perm = PACKET__SEND; } else { /* Locally generated packet, fetch the security label from the * associated socket. */ struct sk_security_struct *sksec = selinux_sock(sk); peer_sid = sksec->sid; secmark_perm = PACKET__SEND; } ifindex = state->out->ifindex; ad_net_init_from_iif(&ad, &net, ifindex, family); if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL)) return NF_DROP; if (secmark_active) if (avc_has_perm(peer_sid, skb->secmark, SECCLASS_PACKET, secmark_perm, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (peerlbl_active) { u32 if_sid; u32 node_sid; if (sel_netif_sid(state->net, ifindex, &if_sid)) return NF_DROP; if (avc_has_perm(peer_sid, if_sid, SECCLASS_NETIF, NETIF__EGRESS, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (sel_netnode_sid(addrp, family, &node_sid)) return NF_DROP; if (avc_has_perm(peer_sid, node_sid, SECCLASS_NODE, NODE__SENDTO, &ad)) return NF_DROP_ERR(-ECONNREFUSED); } return NF_ACCEPT; } #endif /* CONFIG_NETFILTER */ static int nlmsg_sock_has_extended_perms(struct sock *sk, u32 perms, u16 nlmsg_type) { struct sk_security_struct *sksec = sk->sk_security; struct common_audit_data ad; u8 driver; u8 xperm; if (sock_skip_has_perm(sksec->sid)) return 0; ad.type = LSM_AUDIT_DATA_NLMSGTYPE; ad.u.nlmsg_type = nlmsg_type; driver = nlmsg_type >> 8; xperm = nlmsg_type & 0xff; return avc_has_extended_perms(current_sid(), sksec->sid, sksec->sclass, perms, driver, AVC_EXT_NLMSG, xperm, &ad); } static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) { int rc = 0; unsigned int msg_len; unsigned int data_len = skb->len; unsigned char *data = skb->data; struct nlmsghdr *nlh; struct sk_security_struct *sksec = selinux_sock(sk); u16 sclass = sksec->sclass; u32 perm; while (data_len >= nlmsg_total_size(0)) { nlh = (struct nlmsghdr *)data; /* NOTE: the nlmsg_len field isn't reliably set by some netlink * users which means we can't reject skb's with bogus * length fields; our solution is to follow what * netlink_rcv_skb() does and simply skip processing at * messages with length fields that are clearly junk */ if (nlh->nlmsg_len < NLMSG_HDRLEN || nlh->nlmsg_len > data_len) return 0; rc = selinux_nlmsg_lookup(sclass, nlh->nlmsg_type, &perm); if (rc == 0) { if (selinux_policycap_netlink_xperm()) { rc = nlmsg_sock_has_extended_perms( sk, perm, nlh->nlmsg_type); } else { rc = sock_has_perm(sk, perm); } if (rc) return rc; } else if (rc == -EINVAL) { /* -EINVAL is a missing msg/perm mapping */ pr_warn_ratelimited("SELinux: unrecognized netlink" " message: protocol=%hu nlmsg_type=%hu sclass=%s" " pid=%d comm=%s\n", sk->sk_protocol, nlh->nlmsg_type, secclass_map[sclass - 1].name, task_pid_nr(current), current->comm); if (enforcing_enabled() && !security_get_allow_unknown()) return rc; rc = 0; } else if 
(rc == -ENOENT) { /* -ENOENT is a missing socket/class mapping, ignore */ rc = 0; } else { return rc; } /* move to the next message after applying netlink padding */ msg_len = NLMSG_ALIGN(nlh->nlmsg_len); if (msg_len >= data_len) return 0; data_len -= msg_len; data += msg_len; } return rc; } static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass) { isec->sclass = sclass; isec->sid = current_sid(); } static int ipc_has_perm(struct kern_ipc_perm *ipc_perms, u32 perms) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(ipc_perms); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = ipc_perms->key; return avc_has_perm(sid, isec->sid, isec->sclass, perms, &ad); } static int selinux_msg_msg_alloc_security(struct msg_msg *msg) { struct msg_security_struct *msec; msec = selinux_msg_msg(msg); msec->sid = SECINITSID_UNLABELED; return 0; } /* message queue security operations */ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(msq); ipc_init_security(isec, SECCLASS_MSGQ); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; return avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, MSGQ__CREATE, &ad); } static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(msq); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; return avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, MSGQ__ASSOCIATE, &ad); } static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { u32 perms; switch (cmd) { case IPC_INFO: case MSG_INFO: /* No specific object, just general system-wide information. */ return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case MSG_STAT: case MSG_STAT_ANY: perms = MSGQ__GETATTR | MSGQ__ASSOCIATE; break; case IPC_SET: perms = MSGQ__SETATTR; break; case IPC_RMID: perms = MSGQ__DESTROY; break; default: return 0; } return ipc_has_perm(msq, perms); } static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { struct ipc_security_struct *isec; struct msg_security_struct *msec; struct common_audit_data ad; u32 sid = current_sid(); int rc; isec = selinux_ipc(msq); msec = selinux_msg_msg(msg); /* * First time through, need to assign label to the message */ if (msec->sid == SECINITSID_UNLABELED) { /* * Compute new sid based on current process and * message queue this message will be stored in */ rc = security_transition_sid(sid, isec->sid, SECCLASS_MSG, NULL, &msec->sid); if (rc) return rc; } ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; /* Can this process write to the queue? */ rc = avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, MSGQ__WRITE, &ad); if (!rc) /* Can this process send the message */ rc = avc_has_perm(sid, msec->sid, SECCLASS_MSG, MSG__SEND, &ad); if (!rc) /* Can the message be put in the queue? 
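(here the message's SID is the subject and the queue's SID is the target of the MSGQ__ENQUEUE check)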
*/ rc = avc_has_perm(msec->sid, isec->sid, SECCLASS_MSGQ, MSGQ__ENQUEUE, &ad); return rc; } static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { struct ipc_security_struct *isec; struct msg_security_struct *msec; struct common_audit_data ad; u32 sid = task_sid_obj(target); int rc; isec = selinux_ipc(msq); msec = selinux_msg_msg(msg); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; rc = avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, MSGQ__READ, &ad); if (!rc) rc = avc_has_perm(sid, msec->sid, SECCLASS_MSG, MSG__RECEIVE, &ad); return rc; } /* Shared Memory security operations */ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(shp); ipc_init_security(isec, SECCLASS_SHM); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = shp->key; return avc_has_perm(sid, isec->sid, SECCLASS_SHM, SHM__CREATE, &ad); } static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(shp); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = shp->key; return avc_has_perm(sid, isec->sid, SECCLASS_SHM, SHM__ASSOCIATE, &ad); } /* Note, at this point, shp is locked down */ static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { u32 perms; switch (cmd) { case IPC_INFO: case SHM_INFO: /* No specific object, just general system-wide information. */ return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case SHM_STAT: case SHM_STAT_ANY: perms = SHM__GETATTR | SHM__ASSOCIATE; break; case IPC_SET: perms = SHM__SETATTR; break; case SHM_LOCK: case SHM_UNLOCK: perms = SHM__LOCK; break; case IPC_RMID: perms = SHM__DESTROY; break; default: return 0; } return ipc_has_perm(shp, perms); } static int selinux_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { u32 perms; if (shmflg & SHM_RDONLY) perms = SHM__READ; else perms = SHM__READ | SHM__WRITE; return ipc_has_perm(shp, perms); } /* Semaphore security operations */ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(sma); ipc_init_security(isec, SECCLASS_SEM); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = sma->key; return avc_has_perm(sid, isec->sid, SECCLASS_SEM, SEM__CREATE, &ad); } static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(sma); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = sma->key; return avc_has_perm(sid, isec->sid, SECCLASS_SEM, SEM__ASSOCIATE, &ad); } /* Note, at this point, sma is locked down */ static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd) { int err; u32 perms; switch (cmd) { case IPC_INFO: case SEM_INFO: /* No specific object, just general system-wide information. 
*/ return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case GETPID: case GETNCNT: case GETZCNT: perms = SEM__GETATTR; break; case GETVAL: case GETALL: perms = SEM__READ; break; case SETVAL: case SETALL: perms = SEM__WRITE; break; case IPC_RMID: perms = SEM__DESTROY; break; case IPC_SET: perms = SEM__SETATTR; break; case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: perms = SEM__GETATTR | SEM__ASSOCIATE; break; default: return 0; } err = ipc_has_perm(sma, perms); return err; } static int selinux_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { u32 perms; if (alter) perms = SEM__READ | SEM__WRITE; else perms = SEM__READ; return ipc_has_perm(sma, perms); } static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { u32 av = 0; av = 0; if (flag & S_IRUGO) av |= IPC__UNIX_READ; if (flag & S_IWUGO) av |= IPC__UNIX_WRITE; if (av == 0) return 0; return ipc_has_perm(ipcp, av); } static void selinux_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop) { struct ipc_security_struct *isec = selinux_ipc(ipcp); prop->selinux.secid = isec->sid; } static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) { if (inode) inode_doinit_with_dentry(inode, dentry); } static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p, char **value) { const struct cred_security_struct *crsec; int error; u32 sid; u32 len; rcu_read_lock(); crsec = selinux_cred(__task_cred(p)); if (p != current) { error = avc_has_perm(current_sid(), crsec->sid, SECCLASS_PROCESS, PROCESS__GETATTR, NULL); if (error) goto err_unlock; } switch (attr) { case LSM_ATTR_CURRENT: sid = crsec->sid; break; case LSM_ATTR_PREV: sid = crsec->osid; break; case LSM_ATTR_EXEC: sid = crsec->exec_sid; break; case LSM_ATTR_FSCREATE: sid = crsec->create_sid; break; case LSM_ATTR_KEYCREATE: sid = crsec->keycreate_sid; break; case LSM_ATTR_SOCKCREATE: sid = crsec->sockcreate_sid; break; default: error = -EOPNOTSUPP; goto err_unlock; } rcu_read_unlock(); if (sid == SECSID_NULL) { *value = NULL; return 0; } error = security_sid_to_context(sid, value, &len); if (error) return error; return len; err_unlock: rcu_read_unlock(); return error; } static int selinux_lsm_setattr(u64 attr, void *value, size_t size) { struct cred_security_struct *crsec; struct cred *new; u32 mysid = current_sid(), sid = 0, ptsid; int error; char *str = value; /* * Basic control over ability to set these attributes at all. */ switch (attr) { case LSM_ATTR_EXEC: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETEXEC, NULL); break; case LSM_ATTR_FSCREATE: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETFSCREATE, NULL); break; case LSM_ATTR_KEYCREATE: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETKEYCREATE, NULL); break; case LSM_ATTR_SOCKCREATE: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETSOCKCREATE, NULL); break; case LSM_ATTR_CURRENT: error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS, PROCESS__SETCURRENT, NULL); break; default: error = -EOPNOTSUPP; break; } if (error) return error; /* Obtain a SID for the context, if one was specified. 
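For example, a context string such as "unconfined_u:unconfined_r:unconfined_t:s0" (illustrative) written to /proc/self/attr/current arrives here via selinux_setprocattr() and is converted to a SID below.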
*/ if (size && str[0] && str[0] != '\n') { if (str[size-1] == '\n') { str[size-1] = 0; size--; } error = security_context_to_sid(value, size, &sid, GFP_KERNEL); if (error == -EINVAL && attr == LSM_ATTR_FSCREATE) { if (!has_cap_mac_admin(true)) { struct audit_buffer *ab; size_t audit_size; /* We strip a nul only if it is at the end, * otherwise the context contains a nul and * we should audit that */ if (str[size - 1] == '\0') audit_size = size - 1; else audit_size = size; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) return error; audit_log_format(ab, "op=fscreate invalid_context="); audit_log_n_untrustedstring(ab, value, audit_size); audit_log_end(ab); return error; } error = security_context_to_sid_force(value, size, &sid); } if (error) return error; } new = prepare_creds(); if (!new) return -ENOMEM; /* Permission checking based on the specified context is performed during the actual operation (execve, open/mkdir/...), when we know the full context of the operation. See selinux_bprm_creds_for_exec for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. */ crsec = selinux_cred(new); if (attr == LSM_ATTR_EXEC) { crsec->exec_sid = sid; } else if (attr == LSM_ATTR_FSCREATE) { crsec->create_sid = sid; } else if (attr == LSM_ATTR_KEYCREATE) { if (sid) { error = avc_has_perm(mysid, sid, SECCLASS_KEY, KEY__CREATE, NULL); if (error) goto abort_change; } crsec->keycreate_sid = sid; } else if (attr == LSM_ATTR_SOCKCREATE) { crsec->sockcreate_sid = sid; } else if (attr == LSM_ATTR_CURRENT) { error = -EINVAL; if (sid == 0) goto abort_change; if (!current_is_single_threaded()) { error = security_bounded_transition(crsec->sid, sid); if (error) goto abort_change; } /* Check permissions for the transition. */ error = avc_has_perm(crsec->sid, sid, SECCLASS_PROCESS, PROCESS__DYNTRANSITION, NULL); if (error) goto abort_change; /* Check for ptracing, and update the task SID if ok. Otherwise, leave SID unchanged and fail. */ ptsid = ptrace_parent_sid(); if (ptsid != 0) { error = avc_has_perm(ptsid, sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (error) goto abort_change; } crsec->sid = sid; } else { error = -EINVAL; goto abort_change; } commit_creds(new); return size; abort_change: abort_creds(new); return error; } /** * selinux_getselfattr - Get SELinux current task attributes * @attr: the requested attribute * @ctx: buffer to receive the result * @size: buffer size (input), buffer size used (output) * @flags: unused * * Fill the passed user space @ctx with the details of the requested * attribute. * * Returns the number of attributes on success, an error code otherwise. * There will only ever be one attribute. */ static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, u32 *size, u32 flags) { int rc; char *val = NULL; int val_len; val_len = selinux_lsm_getattr(attr, current, &val); if (val_len < 0) return val_len; rc = lsm_fill_user_ctx(ctx, size, val, val_len, LSM_ID_SELINUX, 0); kfree(val); return (!rc ? 
1 : rc); } static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx, u32 size, u32 flags) { int rc; rc = selinux_lsm_setattr(attr, ctx->ctx, ctx->ctx_len); if (rc > 0) return 0; return rc; } static int selinux_getprocattr(struct task_struct *p, const char *name, char **value) { unsigned int attr = lsm_name_to_attr(name); int rc; if (attr) { rc = selinux_lsm_getattr(attr, p, value); if (rc != -EOPNOTSUPP) return rc; } return -EINVAL; } static int selinux_setprocattr(const char *name, void *value, size_t size) { int attr = lsm_name_to_attr(name); if (attr) return selinux_lsm_setattr(attr, value, size); return -EINVAL; } static int selinux_ismaclabel(const char *name) { return (strcmp(name, XATTR_SELINUX_SUFFIX) == 0); } static int selinux_secid_to_secctx(u32 secid, struct lsm_context *cp) { u32 seclen; int ret; if (cp) { cp->id = LSM_ID_SELINUX; ret = security_sid_to_context(secid, &cp->context, &cp->len); if (ret < 0) return ret; return cp->len; } ret = security_sid_to_context(secid, NULL, &seclen); if (ret < 0) return ret; return seclen; } static int selinux_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) { return selinux_secid_to_secctx(prop->selinux.secid, cp); } static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { return security_context_to_sid(secdata, seclen, secid, GFP_KERNEL); } static void selinux_release_secctx(struct lsm_context *cp) { if (cp->id == LSM_ID_SELINUX) { kfree(cp->context); cp->context = NULL; cp->id = LSM_ID_UNDEF; } } static void selinux_inode_invalidate_secctx(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); spin_lock(&isec->lock); isec->initialized = LABEL_INVALID; spin_unlock(&isec->lock); } /* * called with inode->i_mutex locked */ static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { int rc = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX, ctx, ctxlen, 0); /* Do not return error when suppressing label (SBLABEL_MNT not set). */ return rc == -EOPNOTSUPP ? 
0 : rc; } /* * called with inode->i_mutex locked */ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return __vfs_setxattr_locked(&nop_mnt_idmap, dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0, NULL); } static int selinux_inode_getsecctx(struct inode *inode, struct lsm_context *cp) { int len; len = selinux_inode_getsecurity(&nop_mnt_idmap, inode, XATTR_SELINUX_SUFFIX, (void **)&cp->context, true); if (len < 0) return len; cp->len = len; cp->id = LSM_ID_SELINUX; return 0; } #ifdef CONFIG_KEYS static int selinux_key_alloc(struct key *k, const struct cred *cred, unsigned long flags) { const struct cred_security_struct *crsec; struct key_security_struct *ksec = selinux_key(k); crsec = selinux_cred(cred); if (crsec->keycreate_sid) ksec->sid = crsec->keycreate_sid; else ksec->sid = crsec->sid; return 0; } static int selinux_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; struct key_security_struct *ksec; u32 perm, sid; switch (need_perm) { case KEY_NEED_VIEW: perm = KEY__VIEW; break; case KEY_NEED_READ: perm = KEY__READ; break; case KEY_NEED_WRITE: perm = KEY__WRITE; break; case KEY_NEED_SEARCH: perm = KEY__SEARCH; break; case KEY_NEED_LINK: perm = KEY__LINK; break; case KEY_NEED_SETATTR: perm = KEY__SETATTR; break; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: return 0; default: WARN_ON(1); return -EPERM; } sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = selinux_key(key); return avc_has_perm(sid, ksec->sid, SECCLASS_KEY, perm, NULL); } static int selinux_key_getsecurity(struct key *key, char **_buffer) { struct key_security_struct *ksec = selinux_key(key); char *context = NULL; unsigned len; int rc; rc = security_sid_to_context(ksec->sid, &context, &len); if (!rc) rc = len; *_buffer = context; return rc; } #ifdef CONFIG_KEY_NOTIFICATIONS static int selinux_watch_key(struct key *key) { struct key_security_struct *ksec = selinux_key(key); u32 sid = current_sid(); return avc_has_perm(sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL); } #endif #endif #ifdef CONFIG_SECURITY_INFINIBAND static int selinux_ib_pkey_access(void *ib_sec, u64 subnet_prefix, u16 pkey_val) { struct common_audit_data ad; int err; u32 sid = 0; struct ib_security_struct *sec = ib_sec; struct lsm_ibpkey_audit ibpkey; err = sel_ib_pkey_sid(subnet_prefix, pkey_val, &sid); if (err) return err; ad.type = LSM_AUDIT_DATA_IBPKEY; ibpkey.subnet_prefix = subnet_prefix; ibpkey.pkey = pkey_val; ad.u.ibpkey = &ibpkey; return avc_has_perm(sec->sid, sid, SECCLASS_INFINIBAND_PKEY, INFINIBAND_PKEY__ACCESS, &ad); } static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name, u8 port_num) { struct common_audit_data ad; int err; u32 sid = 0; struct ib_security_struct *sec = ib_sec; struct lsm_ibendport_audit ibendport; err = security_ib_endport_sid(dev_name, port_num, &sid); if (err) return err; ad.type = LSM_AUDIT_DATA_IBENDPORT; ibendport.dev_name = dev_name; ibendport.port = port_num; ad.u.ibendport = &ibendport; return avc_has_perm(sec->sid, sid, SECCLASS_INFINIBAND_ENDPORT, INFINIBAND_ENDPORT__MANAGE_SUBNET, &ad); } static int selinux_ib_alloc_security(void *ib_sec) { struct ib_security_struct *sec = selinux_ib(ib_sec); sec->sid = current_sid(); return 0; } #endif #ifdef CONFIG_BPF_SYSCALL static int selinux_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { u32 sid = current_sid(); int ret; switch (cmd) { case BPF_MAP_CREATE: ret = avc_has_perm(sid, 
sid, SECCLASS_BPF, BPF__MAP_CREATE, NULL); break; case BPF_PROG_LOAD: ret = avc_has_perm(sid, sid, SECCLASS_BPF, BPF__PROG_LOAD, NULL); break; default: ret = 0; break; } return ret; } static u32 bpf_map_fmode_to_av(fmode_t fmode) { u32 av = 0; if (fmode & FMODE_READ) av |= BPF__MAP_READ; if (fmode & FMODE_WRITE) av |= BPF__MAP_WRITE; return av; } /* Check a file passed through a unix socket or binder to see whether it is a bpf-related object, and apply the corresponding checks on the bpf object based on its type. Unlike other files and sockets, bpf maps and programs use a shared anonymous inode inside the kernel as their inode, so checking that inode cannot tell whether the process has the privilege to access the bpf object; that is why this additional check is needed in selinux_file_receive and selinux_binder_transfer_files. */ static int bpf_fd_pass(const struct file *file, u32 sid) { struct bpf_security_struct *bpfsec; struct bpf_prog *prog; struct bpf_map *map; int ret; if (file->f_op == &bpf_map_fops) { map = file->private_data; bpfsec = selinux_bpf_map_security(map); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(file->f_mode), NULL); if (ret) return ret; } else if (file->f_op == &bpf_prog_fops) { prog = file->private_data; bpfsec = selinux_bpf_prog_security(prog); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); if (ret) return ret; } return 0; } static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) { u32 sid = current_sid(); struct bpf_security_struct *bpfsec; bpfsec = selinux_bpf_map_security(map); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(fmode), NULL); } static int selinux_bpf_prog(struct bpf_prog *prog) { u32 sid = current_sid(); struct bpf_security_struct *bpfsec; bpfsec = selinux_bpf_prog_security(prog); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); } static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; bpfsec = selinux_bpf_map_security(map); bpfsec->sid = current_sid(); return 0; } static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; bpfsec = selinux_bpf_prog_security(prog); bpfsec->sid = current_sid(); return 0; } static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { struct bpf_security_struct *bpfsec; bpfsec = selinux_bpf_token_security(token); bpfsec->sid = current_sid(); return 0; } #endif struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct cred_security_struct), .lbs_task = sizeof(struct task_security_struct), .lbs_file = sizeof(struct file_security_struct), .lbs_inode = sizeof(struct inode_security_struct), .lbs_ipc = sizeof(struct ipc_security_struct), .lbs_key = sizeof(struct key_security_struct), .lbs_msg_msg = sizeof(struct msg_security_struct), #ifdef CONFIG_PERF_EVENTS .lbs_perf_event = sizeof(struct perf_event_security_struct), #endif .lbs_sock = sizeof(struct sk_security_struct), .lbs_superblock = sizeof(struct superblock_security_struct), .lbs_xattr_count = SELINUX_INODE_INIT_XATTRS, .lbs_tun_dev = sizeof(struct tun_security_struct), .lbs_ib = sizeof(struct ib_security_struct), .lbs_bpf_map = sizeof(struct bpf_security_struct), .lbs_bpf_prog = sizeof(struct bpf_security_struct), .lbs_bpf_token = sizeof(struct
bpf_security_struct), }; #ifdef CONFIG_PERF_EVENTS static int selinux_perf_event_open(int type) { u32 requested, sid = current_sid(); if (type == PERF_SECURITY_OPEN) requested = PERF_EVENT__OPEN; else if (type == PERF_SECURITY_CPU) requested = PERF_EVENT__CPU; else if (type == PERF_SECURITY_KERNEL) requested = PERF_EVENT__KERNEL; else if (type == PERF_SECURITY_TRACEPOINT) requested = PERF_EVENT__TRACEPOINT; else return -EINVAL; return avc_has_perm(sid, sid, SECCLASS_PERF_EVENT, requested, NULL); } static int selinux_perf_event_alloc(struct perf_event *event) { struct perf_event_security_struct *perfsec; perfsec = selinux_perf_event(event->security); perfsec->sid = current_sid(); return 0; } static int selinux_perf_event_read(struct perf_event *event) { struct perf_event_security_struct *perfsec = event->security; u32 sid = current_sid(); return avc_has_perm(sid, perfsec->sid, SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL); } static int selinux_perf_event_write(struct perf_event *event) { struct perf_event_security_struct *perfsec = event->security; u32 sid = current_sid(); return avc_has_perm(sid, perfsec->sid, SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL); } #endif #ifdef CONFIG_IO_URING /** * selinux_uring_override_creds - check the requested cred override * @new: the target creds * * Check to see if the current task is allowed to override its credentials * to service an io_uring operation. */ static int selinux_uring_override_creds(const struct cred *new) { return avc_has_perm(current_sid(), cred_sid(new), SECCLASS_IO_URING, IO_URING__OVERRIDE_CREDS, NULL); } /** * selinux_uring_sqpoll - check if an io_uring polling thread can be created * * Check to see if the current task is allowed to create a new io_uring * kernel polling thread. */ static int selinux_uring_sqpoll(void) { u32 sid = current_sid(); return avc_has_perm(sid, sid, SECCLASS_IO_URING, IO_URING__SQPOLL, NULL); } /** * selinux_uring_cmd - check if IORING_OP_URING_CMD is allowed * @ioucmd: the io_uring command structure * * Check to see if the current domain is allowed to execute an * IORING_OP_URING_CMD against the device/file specified in @ioucmd. * */ static int selinux_uring_cmd(struct io_uring_cmd *ioucmd) { struct file *file = ioucmd->file; struct inode *inode = file_inode(file); struct inode_security_struct *isec = selinux_inode(inode); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; return avc_has_perm(current_sid(), isec->sid, SECCLASS_IO_URING, IO_URING__CMD, &ad); } /** * selinux_uring_allowed - check if io_uring_setup() can be called * * Check to see if the current task is allowed to call io_uring_setup(). */ static int selinux_uring_allowed(void) { u32 sid = current_sid(); return avc_has_perm(sid, sid, SECCLASS_IO_URING, IO_URING__ALLOWED, NULL); } #endif /* CONFIG_IO_URING */ static const struct lsm_id selinux_lsmid = { .name = "selinux", .id = LSM_ID_SELINUX, }; /* * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order: * 1. any hooks that don't belong to (2.) or (3.) below, * 2. hooks that both access structures allocated by other hooks, and allocate * structures that can be later accessed by other hooks (mostly "cloning" * hooks), * 3. hooks that only allocate structures that can be later accessed by other * hooks ("allocating" hooks). * * Please follow block comment delimiters in the list to keep this order.
*/ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr), LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction), LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder), LSM_HOOK_INIT(binder_transfer_file, selinux_binder_transfer_file), LSM_HOOK_INIT(ptrace_access_check, selinux_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, selinux_ptrace_traceme), LSM_HOOK_INIT(capget, selinux_capget), LSM_HOOK_INIT(capset, selinux_capset), LSM_HOOK_INIT(capable, selinux_capable), LSM_HOOK_INIT(quotactl, selinux_quotactl), LSM_HOOK_INIT(quota_on, selinux_quota_on), LSM_HOOK_INIT(syslog, selinux_syslog), LSM_HOOK_INIT(vm_enough_memory, selinux_vm_enough_memory), LSM_HOOK_INIT(netlink_send, selinux_netlink_send), LSM_HOOK_INIT(bprm_creds_for_exec, selinux_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds), LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts), LSM_HOOK_INIT(sb_mnt_opts_compat, selinux_sb_mnt_opts_compat), LSM_HOOK_INIT(sb_remount, selinux_sb_remount), LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount), LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options), LSM_HOOK_INIT(sb_statfs, selinux_sb_statfs), LSM_HOOK_INIT(sb_mount, selinux_mount), LSM_HOOK_INIT(sb_umount, selinux_umount), LSM_HOOK_INIT(sb_set_mnt_opts, selinux_set_mnt_opts), LSM_HOOK_INIT(sb_clone_mnt_opts, selinux_sb_clone_mnt_opts), LSM_HOOK_INIT(move_mount, selinux_move_mount), LSM_HOOK_INIT(dentry_init_security, selinux_dentry_init_security), LSM_HOOK_INIT(dentry_create_files_as, selinux_dentry_create_files_as), LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security), LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security), LSM_HOOK_INIT(inode_init_security_anon, selinux_inode_init_security_anon), LSM_HOOK_INIT(inode_create, selinux_inode_create), LSM_HOOK_INIT(inode_link, selinux_inode_link), LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink), LSM_HOOK_INIT(inode_symlink, selinux_inode_symlink), LSM_HOOK_INIT(inode_mkdir, selinux_inode_mkdir), LSM_HOOK_INIT(inode_rmdir, selinux_inode_rmdir), LSM_HOOK_INIT(inode_mknod, selinux_inode_mknod), LSM_HOOK_INIT(inode_rename, selinux_inode_rename), LSM_HOOK_INIT(inode_readlink, selinux_inode_readlink), LSM_HOOK_INIT(inode_follow_link, selinux_inode_follow_link), LSM_HOOK_INIT(inode_permission, selinux_inode_permission), LSM_HOOK_INIT(inode_setattr, selinux_inode_setattr), LSM_HOOK_INIT(inode_getattr, selinux_inode_getattr), LSM_HOOK_INIT(inode_xattr_skipcap, selinux_inode_xattr_skipcap), LSM_HOOK_INIT(inode_setxattr, selinux_inode_setxattr), LSM_HOOK_INIT(inode_post_setxattr, selinux_inode_post_setxattr), LSM_HOOK_INIT(inode_getxattr, selinux_inode_getxattr), LSM_HOOK_INIT(inode_listxattr, selinux_inode_listxattr), LSM_HOOK_INIT(inode_removexattr, selinux_inode_removexattr), LSM_HOOK_INIT(inode_file_getattr, selinux_inode_file_getattr), LSM_HOOK_INIT(inode_file_setattr, selinux_inode_file_setattr), LSM_HOOK_INIT(inode_set_acl, selinux_inode_set_acl), LSM_HOOK_INIT(inode_get_acl, selinux_inode_get_acl), LSM_HOOK_INIT(inode_remove_acl, selinux_inode_remove_acl), LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity), LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity), LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity), LSM_HOOK_INIT(inode_getlsmprop, selinux_inode_getlsmprop), LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up), 
LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr), LSM_HOOK_INIT(path_notify, selinux_path_notify), LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security), LSM_HOOK_INIT(file_permission, selinux_file_permission), LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security), LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat), LSM_HOOK_INIT(mmap_file, selinux_mmap_file), LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr), LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect), LSM_HOOK_INIT(file_lock, selinux_file_lock), LSM_HOOK_INIT(file_fcntl, selinux_file_fcntl), LSM_HOOK_INIT(file_set_fowner, selinux_file_set_fowner), LSM_HOOK_INIT(file_send_sigiotask, selinux_file_send_sigiotask), LSM_HOOK_INIT(file_receive, selinux_file_receive), LSM_HOOK_INIT(file_open, selinux_file_open), LSM_HOOK_INIT(task_alloc, selinux_task_alloc), LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare), LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer), LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid), LSM_HOOK_INIT(cred_getlsmprop, selinux_cred_getlsmprop), LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as), LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as), LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request), LSM_HOOK_INIT(kernel_load_data, selinux_kernel_load_data), LSM_HOOK_INIT(kernel_read_file, selinux_kernel_read_file), LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), LSM_HOOK_INIT(task_getsid, selinux_task_getsid), LSM_HOOK_INIT(current_getlsmprop_subj, selinux_current_getlsmprop_subj), LSM_HOOK_INIT(task_getlsmprop_obj, selinux_task_getlsmprop_obj), LSM_HOOK_INIT(task_setnice, selinux_task_setnice), LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio), LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio), LSM_HOOK_INIT(task_prlimit, selinux_task_prlimit), LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit), LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler), LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler), LSM_HOOK_INIT(task_movememory, selinux_task_movememory), LSM_HOOK_INIT(task_kill, selinux_task_kill), LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode), LSM_HOOK_INIT(userns_create, selinux_userns_create), LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission), LSM_HOOK_INIT(ipc_getlsmprop, selinux_ipc_getlsmprop), LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate), LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl), LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd), LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv), LSM_HOOK_INIT(shm_associate, selinux_shm_associate), LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl), LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat), LSM_HOOK_INIT(sem_associate, selinux_sem_associate), LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl), LSM_HOOK_INIT(sem_semop, selinux_sem_semop), LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate), LSM_HOOK_INIT(getselfattr, selinux_getselfattr), LSM_HOOK_INIT(setselfattr, selinux_setselfattr), LSM_HOOK_INIT(getprocattr, selinux_getprocattr), LSM_HOOK_INIT(setprocattr, selinux_setprocattr), LSM_HOOK_INIT(ismaclabel, selinux_ismaclabel), LSM_HOOK_INIT(secctx_to_secid, selinux_secctx_to_secid), LSM_HOOK_INIT(release_secctx, selinux_release_secctx), LSM_HOOK_INIT(inode_invalidate_secctx, selinux_inode_invalidate_secctx), LSM_HOOK_INIT(inode_notifysecctx, selinux_inode_notifysecctx), LSM_HOOK_INIT(inode_setsecctx, 
selinux_inode_setsecctx), LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect), LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send), LSM_HOOK_INIT(socket_create, selinux_socket_create), LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create), LSM_HOOK_INIT(socket_socketpair, selinux_socket_socketpair), LSM_HOOK_INIT(socket_bind, selinux_socket_bind), LSM_HOOK_INIT(socket_connect, selinux_socket_connect), LSM_HOOK_INIT(socket_listen, selinux_socket_listen), LSM_HOOK_INIT(socket_accept, selinux_socket_accept), LSM_HOOK_INIT(socket_sendmsg, selinux_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, selinux_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, selinux_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, selinux_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, selinux_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, selinux_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, selinux_socket_shutdown), LSM_HOOK_INIT(socket_sock_rcv_skb, selinux_socket_sock_rcv_skb), LSM_HOOK_INIT(socket_getpeersec_stream, selinux_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, selinux_socket_getpeersec_dgram), LSM_HOOK_INIT(sk_free_security, selinux_sk_free_security), LSM_HOOK_INIT(sk_clone_security, selinux_sk_clone_security), LSM_HOOK_INIT(sk_getsecid, selinux_sk_getsecid), LSM_HOOK_INIT(sock_graft, selinux_sock_graft), LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request), LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone), LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect), LSM_HOOK_INIT(sctp_assoc_established, selinux_sctp_assoc_established), LSM_HOOK_INIT(mptcp_add_subflow, selinux_mptcp_add_subflow), LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request), LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone), LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established), LSM_HOOK_INIT(secmark_relabel_packet, selinux_secmark_relabel_packet), LSM_HOOK_INIT(secmark_refcount_inc, selinux_secmark_refcount_inc), LSM_HOOK_INIT(secmark_refcount_dec, selinux_secmark_refcount_dec), LSM_HOOK_INIT(req_classify_flow, selinux_req_classify_flow), LSM_HOOK_INIT(tun_dev_create, selinux_tun_dev_create), LSM_HOOK_INIT(tun_dev_attach_queue, selinux_tun_dev_attach_queue), LSM_HOOK_INIT(tun_dev_attach, selinux_tun_dev_attach), LSM_HOOK_INIT(tun_dev_open, selinux_tun_dev_open), #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK_INIT(ib_pkey_access, selinux_ib_pkey_access), LSM_HOOK_INIT(ib_endport_manage_subnet, selinux_ib_endport_manage_subnet), #endif #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_free_security, selinux_xfrm_policy_free), LSM_HOOK_INIT(xfrm_policy_delete_security, selinux_xfrm_policy_delete), LSM_HOOK_INIT(xfrm_state_free_security, selinux_xfrm_state_free), LSM_HOOK_INIT(xfrm_state_delete_security, selinux_xfrm_state_delete), LSM_HOOK_INIT(xfrm_policy_lookup, selinux_xfrm_policy_lookup), LSM_HOOK_INIT(xfrm_state_pol_flow_match, selinux_xfrm_state_pol_flow_match), LSM_HOOK_INIT(xfrm_decode_session, selinux_xfrm_decode_session), #endif #ifdef CONFIG_KEYS LSM_HOOK_INIT(key_permission, selinux_key_permission), LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity), #ifdef CONFIG_KEY_NOTIFICATIONS LSM_HOOK_INIT(watch_key, selinux_watch_key), #endif #endif #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_known, selinux_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free), #endif #ifdef CONFIG_BPF_SYSCALL LSM_HOOK_INIT(bpf, selinux_bpf), 
LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), #endif #ifdef CONFIG_PERF_EVENTS LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open), LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read), LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write), #endif #ifdef CONFIG_IO_URING LSM_HOOK_INIT(uring_override_creds, selinux_uring_override_creds), LSM_HOOK_INIT(uring_sqpoll, selinux_uring_sqpoll), LSM_HOOK_INIT(uring_cmd, selinux_uring_cmd), LSM_HOOK_INIT(uring_allowed, selinux_uring_allowed), #endif /* * PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE */ LSM_HOOK_INIT(fs_context_submount, selinux_fs_context_submount), LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup), LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param), LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts), #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_clone_security, selinux_xfrm_policy_clone), #endif /* * PUT "ALLOCATING" HOOKS HERE */ LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security), LSM_HOOK_INIT(msg_queue_alloc_security, selinux_msg_queue_alloc_security), LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security), LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security), LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security), LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security), LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx), LSM_HOOK_INIT(lsmprop_to_secctx, selinux_lsmprop_to_secctx), LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx), LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security), LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security), #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK_INIT(ib_alloc_security, selinux_ib_alloc_security), #endif #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_alloc_security, selinux_xfrm_policy_alloc), LSM_HOOK_INIT(xfrm_state_alloc, selinux_xfrm_state_alloc), LSM_HOOK_INIT(xfrm_state_alloc_acquire, selinux_xfrm_state_alloc_acquire), #endif #ifdef CONFIG_KEYS LSM_HOOK_INIT(key_alloc, selinux_key_alloc), #endif #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init), #endif #ifdef CONFIG_BPF_SYSCALL LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create), LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load), LSM_HOOK_INIT(bpf_token_create, selinux_bpf_token_create), #endif #ifdef CONFIG_PERF_EVENTS LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc), #endif }; static __init int selinux_init(void) { pr_info("SELinux: Initializing.\n"); memset(&selinux_state, 0, sizeof(selinux_state)); enforcing_set(selinux_enforcing_boot); selinux_avc_init(); mutex_init(&selinux_state.status_lock); mutex_init(&selinux_state.policy_mutex); /* Set the security state for the initial task. 
*/ cred_init_security(); /* Inform the audit system that secctx is used */ audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT | AUDIT_CFG_LSM_SECCTX_OBJECT); default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); avc_init(); avtab_cache_init(); ebitmap_cache_init(); hashtab_cache_init(); security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), &selinux_lsmid); if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET)) panic("SELinux: Unable to register AVC netcache callback\n"); if (avc_add_callback(selinux_lsm_notifier_avc_callback, AVC_CALLBACK_RESET)) panic("SELinux: Unable to register AVC LSM notifier callback\n"); if (avc_add_callback(selinux_audit_rule_avc_callback, AVC_CALLBACK_RESET)) panic("SELinux: Unable to register AVC audit callback\n"); if (selinux_enforcing_boot) pr_debug("SELinux: Starting in enforcing mode\n"); else pr_debug("SELinux: Starting in permissive mode\n"); fs_validate_description("selinux", selinux_fs_parameters); return 0; } static void delayed_superblock_init(struct super_block *sb, void *unused) { selinux_set_mnt_opts(sb, NULL, 0, NULL); } void selinux_complete_init(void) { pr_debug("SELinux: Completing initialization.\n"); /* Set up any superblocks initialized prior to the policy load. */ pr_debug("SELinux: Setting up existing superblocks.\n"); iterate_supers(delayed_superblock_init, NULL); } /* SELinux requires early initialization in order to label all processes and objects when they are created. */ DEFINE_LSM(selinux) = { .id = &selinux_lsmid, .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &selinux_enabled_boot, .blobs = &selinux_blob_sizes, .init = selinux_init, .initcall_device = selinux_initcall, }; #if defined(CONFIG_NETFILTER) static const struct nf_hook_ops selinux_nf_ops[] = { { .hook = selinux_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_LAST, }, { .hook = selinux_ip_forward, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_SELINUX_FIRST, }, { .hook = selinux_ip_output, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = selinux_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_LAST, }, { .hook = selinux_ip_forward, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = NF_IP6_PRI_SELINUX_FIRST, }, { .hook = selinux_ip_output, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif /* IPV6 */ }; static int __net_init selinux_nf_register(struct net *net) { return nf_register_net_hooks(net, selinux_nf_ops, ARRAY_SIZE(selinux_nf_ops)); } static void __net_exit selinux_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, selinux_nf_ops, ARRAY_SIZE(selinux_nf_ops)); } static struct pernet_operations selinux_net_ops = { .init = selinux_nf_register, .exit = selinux_nf_unregister, }; int __init selinux_nf_ip_init(void) { int err; if (!selinux_enabled_boot) return 0; pr_debug("SELinux: Registering netfilter hooks\n"); err = register_pernet_subsys(&selinux_net_ops); if (err) panic("SELinux: register_pernet_subsys: error %d\n", err); return 0; } #endif /* CONFIG_NETFILTER */
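/*
 * Illustrative sketch, not part of the kernel sources above: the
 * selinux_getselfattr()/selinux_setselfattr() hooks shown earlier sit behind
 * the lsm_get_self_attr(2)/lsm_set_self_attr(2) system calls.  A minimal
 * user-space reader of the current SELinux context might look like the
 * following.  The 4096-byte buffer size and the raw syscall() invocation are
 * assumptions made for the example, and SYS_lsm_get_self_attr is only
 * available with kernel/libc headers new enough to provide <linux/lsm.h>
 * and the syscall number (Linux 6.8 or later).
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/lsm.h>

int main(void)
{
	__u32 size = 4096;
	struct lsm_ctx *ctx = malloc(size);
	long nattr;

	if (!ctx)
		return 1;
	/* Ask every active LSM for its "current" attribute. */
	nattr = syscall(SYS_lsm_get_self_attr, LSM_ATTR_CURRENT, ctx, &size, 0);
	if (nattr < 0) {
		perror("lsm_get_self_attr");
		free(ctx);
		return 1;
	}
	/* SELinux reports a single attribute; ctx->ctx holds the context string. */
	if (ctx->id == LSM_ID_SELINUX)
		printf("current SELinux context: %.*s\n",
		       (int)ctx->ctx_len, (const char *)ctx->ctx);
	free(ctx);
	return 0;
}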
// SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/sched.c * * Scheduling for synchronous and asynchronous RPC requests.
* * Copyright (C) 1996 Olaf Kirch, <okir@monad.swb.de> * * TCP NFS related read + write fixes * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> */ #include <linux/module.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/mempool.h> #include <linux/smp.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/freezer.h> #include <linux/sched/mm.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/metrics.h> #include "sunrpc.h" #define CREATE_TRACE_POINTS #include <trace/events/sunrpc.h> /* * RPC slabs and memory pools */ #define RPC_BUFFER_MAXSIZE (2048) #define RPC_BUFFER_POOLSIZE (8) #define RPC_TASK_POOLSIZE (8) static struct kmem_cache *rpc_task_slabp __read_mostly; static struct kmem_cache *rpc_buffer_slabp __read_mostly; static mempool_t *rpc_task_mempool __read_mostly; static mempool_t *rpc_buffer_mempool __read_mostly; static void rpc_async_schedule(struct work_struct *); static void rpc_release_task(struct rpc_task *task); static void __rpc_queue_timer_fn(struct work_struct *); /* * RPC tasks sit here while waiting for conditions to improve. */ static struct rpc_wait_queue delay_queue; /* * rpciod-related stuff */ struct workqueue_struct *rpciod_workqueue __read_mostly; struct workqueue_struct *xprtiod_workqueue __read_mostly; EXPORT_SYMBOL_GPL(xprtiod_workqueue); gfp_t rpc_task_gfp_mask(void) { if (current->flags & PF_WQ_WORKER) return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; return GFP_KERNEL; } EXPORT_SYMBOL_GPL(rpc_task_gfp_mask); bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status) { if (cmpxchg(&task->tk_rpc_status, 0, rpc_status) == 0) return true; return false; } unsigned long rpc_task_timeout(const struct rpc_task *task) { unsigned long timeout = READ_ONCE(task->tk_timeout); if (timeout != 0) { unsigned long now = jiffies; if (time_before(now, timeout)) return timeout - now; } return 0; } EXPORT_SYMBOL_GPL(rpc_task_timeout); /* * Disable the timer for a given RPC task. Should be called with * queue->lock and bh_disabled in order to avoid races within * rpc_run_timer(). */ static void __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task) { if (list_empty(&task->u.tk_wait.timer_list)) return; task->tk_timeout = 0; list_del(&task->u.tk_wait.timer_list); if (list_empty(&queue->timer_list.list)) cancel_delayed_work(&queue->timer_list.dwork); } static void rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires) { unsigned long now = jiffies; queue->timer_list.expires = expires; if (time_before_eq(expires, now)) expires = 0; else expires -= now; mod_delayed_work(rpciod_workqueue, &queue->timer_list.dwork, expires); } /* * Set up a timer for the current task. 
*/ static void __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned long timeout) { task->tk_timeout = timeout; if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires)) rpc_set_queue_timer(queue, timeout); list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) { if (queue->priority != priority) { queue->priority = priority; queue->nr = 1U << priority; } } static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) { rpc_set_waitqueue_priority(queue, queue->maxpriority); } /* * Add a request to a queue list */ static void __rpc_list_enqueue_task(struct list_head *q, struct rpc_task *task) { struct rpc_task *t; list_for_each_entry(t, q, u.tk_wait.list) { if (t->tk_owner == task->tk_owner) { list_add_tail(&task->u.tk_wait.links, &t->u.tk_wait.links); /* Cache the queue head in task->u.tk_wait.list */ task->u.tk_wait.list.next = q; task->u.tk_wait.list.prev = NULL; return; } } INIT_LIST_HEAD(&task->u.tk_wait.links); list_add_tail(&task->u.tk_wait.list, q); } /* * Remove request from a queue list */ static void __rpc_list_dequeue_task(struct rpc_task *task) { struct list_head *q; struct rpc_task *t; if (task->u.tk_wait.list.prev == NULL) { list_del(&task->u.tk_wait.links); return; } if (!list_empty(&task->u.tk_wait.links)) { t = list_first_entry(&task->u.tk_wait.links, struct rpc_task, u.tk_wait.links); /* Assume __rpc_list_enqueue_task() cached the queue head */ q = t->u.tk_wait.list.next; list_add_tail(&t->u.tk_wait.list, q); list_del(&task->u.tk_wait.links); } list_del(&task->u.tk_wait.list); } /* * Add new request to a priority queue. */ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned char queue_priority) { if (unlikely(queue_priority > queue->maxpriority)) queue_priority = queue->maxpriority; __rpc_list_enqueue_task(&queue->tasks[queue_priority], task); } /* * Add new request to wait queue. */ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned char queue_priority) { INIT_LIST_HEAD(&task->u.tk_wait.timer_list); if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); else list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); task->tk_waitqueue = queue; queue->qlen++; /* barrier matches the read in rpc_wake_up_task_queue_locked() */ smp_wmb(); rpc_set_queued(task); } /* * Remove request from a priority queue. */ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) { __rpc_list_dequeue_task(task); } /* * Remove request from queue. * Note: must be called with spin lock held. 
*/ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) { __rpc_disable_timer(queue, task); if (RPC_IS_PRIORITY(queue)) __rpc_remove_wait_queue_priority(task); else list_del(&task->u.tk_wait.list); queue->qlen--; } static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues) { int i; spin_lock_init(&queue->lock); for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) INIT_LIST_HEAD(&queue->tasks[i]); queue->maxpriority = nr_queues - 1; rpc_reset_waitqueue_priority(queue); queue->qlen = 0; queue->timer_list.expires = 0; INIT_DELAYED_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn); INIT_LIST_HEAD(&queue->timer_list.list); rpc_assign_waitqueue_name(queue, qname); } void rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname) { __rpc_init_priority_wait_queue(queue, qname, RPC_NR_PRIORITY); } EXPORT_SYMBOL_GPL(rpc_init_priority_wait_queue); void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname) { __rpc_init_priority_wait_queue(queue, qname, 1); } EXPORT_SYMBOL_GPL(rpc_init_wait_queue); void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) { cancel_delayed_work_sync(&queue->timer_list.dwork); } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) static void rpc_task_set_debuginfo(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; /* Might be a task carrying a reverse-direction operation */ if (!clnt) { static atomic_t rpc_pid; task->tk_pid = atomic_inc_return(&rpc_pid); return; } task->tk_pid = atomic_inc_return(&clnt->cl_pid); } #else static inline void rpc_task_set_debuginfo(struct rpc_task *task) { } #endif static void rpc_set_active(struct rpc_task *task) { rpc_task_set_debuginfo(task); set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); trace_rpc_task_begin(task, NULL); } /* * Mark an RPC call as having completed by clearing the 'active' bit * and then waking up all tasks that were sleeping. */ static int rpc_complete_task(struct rpc_task *task) { void *m = &task->tk_runstate; wait_queue_head_t *wq = bit_waitqueue(m, RPC_TASK_ACTIVE); struct wait_bit_key k = __WAIT_BIT_KEY_INITIALIZER(m, RPC_TASK_ACTIVE); unsigned long flags; int ret; trace_rpc_task_complete(task, NULL); spin_lock_irqsave(&wq->lock, flags); clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); ret = atomic_dec_and_test(&task->tk_count); if (waitqueue_active(wq)) __wake_up_locked_key(wq, TASK_NORMAL, &k); spin_unlock_irqrestore(&wq->lock, flags); return ret; } /* * Allow callers to wait for completion of an RPC call * * Note the use of out_of_line_wait_on_bit() rather than wait_on_bit() * to enforce taking of the wq->lock and hence avoid races with * rpc_complete_task(). */ int rpc_wait_for_completion_task(struct rpc_task *task) { return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task); /* * Make an RPC task runnable. * * Note: If the task is ASYNC, and is being made runnable after sitting on an * rpc_wait_queue, this must be called with the queue spinlock held to protect * the wait queue operation. 
* Note the ordering of rpc_test_and_set_running() and rpc_clear_queued(), * which is needed to ensure that __rpc_execute() doesn't loop (due to the * lockless RPC_IS_QUEUED() test) before we've had a chance to test * the RPC_TASK_RUNNING flag. */ static void rpc_make_runnable(struct workqueue_struct *wq, struct rpc_task *task) { bool need_wakeup = !rpc_test_and_set_running(task); rpc_clear_queued(task); if (!need_wakeup) return; if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); queue_work(wq, &task->u.tk_work); } else { smp_mb__after_atomic(); wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); } } /* * Prepare for sleeping on a wait queue. * By always appending tasks to the list we ensure FIFO behavior. * NB: An RPC task will only receive interrupt-driven events as long * as it's on a wait queue. */ static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, unsigned char queue_priority) { trace_rpc_task_sleep(task, q); __rpc_add_wait_queue(q, task, queue_priority); } static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, unsigned char queue_priority) { if (WARN_ON_ONCE(RPC_IS_QUEUED(task))) return; __rpc_do_sleep_on_priority(q, task, queue_priority); } static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, struct rpc_task *task, unsigned long timeout, unsigned char queue_priority) { if (WARN_ON_ONCE(RPC_IS_QUEUED(task))) return; if (time_is_after_jiffies(timeout)) { __rpc_do_sleep_on_priority(q, task, queue_priority); __rpc_add_timer(q, task, timeout); } else task->tk_status = -ETIMEDOUT; } static void rpc_set_tk_callback(struct rpc_task *task, rpc_action action) { if (action && !WARN_ON_ONCE(task->tk_callback != NULL)) task->tk_callback = action; } static bool rpc_sleep_check_activated(struct rpc_task *task) { /* We shouldn't ever put an inactive task to sleep */ if (WARN_ON_ONCE(!RPC_IS_ACTIVATED(task))) { task->tk_status = -EIO; rpc_put_task_async(task); return false; } return true; } void rpc_sleep_on_timeout(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action, unsigned long timeout) { if (!rpc_sleep_check_activated(task)) return; rpc_set_tk_callback(task, action); /* * Protect the queue operations. */ spin_lock(&q->lock); __rpc_sleep_on_priority_timeout(q, task, timeout, task->tk_priority); spin_unlock(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on_timeout); void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action) { if (!rpc_sleep_check_activated(task)) return; rpc_set_tk_callback(task, action); WARN_ON_ONCE(task->tk_timeout != 0); /* * Protect the queue operations. */ spin_lock(&q->lock); __rpc_sleep_on_priority(q, task, task->tk_priority); spin_unlock(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on); void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, struct rpc_task *task, unsigned long timeout, int priority) { if (!rpc_sleep_check_activated(task)) return; priority -= RPC_PRIORITY_LOW; /* * Protect the queue operations. */ spin_lock(&q->lock); __rpc_sleep_on_priority_timeout(q, task, timeout, priority); spin_unlock(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on_priority_timeout); void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, int priority) { if (!rpc_sleep_check_activated(task)) return; WARN_ON_ONCE(task->tk_timeout != 0); priority -= RPC_PRIORITY_LOW; /* * Protect the queue operations. 
*/ spin_lock(&q->lock); __rpc_sleep_on_priority(q, task, priority); spin_unlock(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); /** * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task * @wq: workqueue on which to run task * @queue: wait queue * @task: task to be woken up * * Caller must hold queue->lock, and have cleared the task queued flag. */ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, struct rpc_wait_queue *queue, struct rpc_task *task) { /* Has the task been executed yet? If not, we cannot wake it up! */ if (!RPC_IS_ACTIVATED(task)) { printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); return; } trace_rpc_task_wakeup(task, queue); __rpc_remove_wait_queue(queue, task); rpc_make_runnable(wq, task); } /* * Wake up a queued task while the queue lock is being held */ static struct rpc_task * rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, struct rpc_wait_queue *queue, struct rpc_task *task, bool (*action)(struct rpc_task *, void *), void *data) { if (RPC_IS_QUEUED(task)) { smp_rmb(); if (task->tk_waitqueue == queue) { if (action == NULL || action(task, data)) { __rpc_do_wake_up_task_on_wq(wq, queue, task); return task; } } } return NULL; } /* * Wake up a queued task while the queue lock is being held */ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) { rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, task, NULL, NULL); } /* * Wake up a task on a specific queue */ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) { if (!RPC_IS_QUEUED(task)) return; spin_lock(&queue->lock); rpc_wake_up_task_queue_locked(queue, task); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); static bool rpc_task_action_set_status(struct rpc_task *task, void *status) { task->tk_status = *(int *)status; return true; } static void rpc_wake_up_task_queue_set_status_locked(struct rpc_wait_queue *queue, struct rpc_task *task, int status) { rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, task, rpc_task_action_set_status, &status); } /** * rpc_wake_up_queued_task_set_status - wake up a task and set task->tk_status * @queue: pointer to rpc_wait_queue * @task: pointer to rpc_task * @status: integer error value * * If @task is queued on @queue, then it is woken up, and @task->tk_status is * set to the value of @status. */ void rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue, struct rpc_task *task, int status) { if (!RPC_IS_QUEUED(task)) return; spin_lock(&queue->lock); rpc_wake_up_task_queue_set_status_locked(queue, task, status); spin_unlock(&queue->lock); } /* * Wake up the next task on a priority queue. */ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *queue) { struct list_head *q; struct rpc_task *task; /* * Service the privileged queue. */ q = &queue->tasks[RPC_NR_PRIORITY - 1]; if (queue->maxpriority > RPC_PRIORITY_PRIVILEGED && !list_empty(q)) { task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto out; } /* * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; if (!list_empty(q) && queue->nr) { queue->nr--; task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto out; } /* * Service the next queue. 
*/ do { if (q == &queue->tasks[0]) q = &queue->tasks[queue->maxpriority]; else q = q - 1; if (!list_empty(q)) { task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto new_queue; } } while (q != &queue->tasks[queue->priority]); rpc_reset_waitqueue_priority(queue); return NULL; new_queue: rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); out: return task; } static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue) { if (RPC_IS_PRIORITY(queue)) return __rpc_find_next_queued_priority(queue); if (!list_empty(&queue->tasks[0])) return list_first_entry(&queue->tasks[0], struct rpc_task, u.tk_wait.list); return NULL; } /* * Wake up the first task on the wait queue. */ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, struct rpc_wait_queue *queue, bool (*func)(struct rpc_task *, void *), void *data) { struct rpc_task *task = NULL; spin_lock(&queue->lock); task = __rpc_find_next_queued(queue); if (task != NULL) task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, func, data); spin_unlock(&queue->lock); return task; } /* * Wake up the first task on the wait queue. */ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, bool (*func)(struct rpc_task *, void *), void *data) { return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data); } EXPORT_SYMBOL_GPL(rpc_wake_up_first); static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) { return true; } /* * Wake up the next task on the wait queue. */ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue) { return rpc_wake_up_first(queue, rpc_wake_up_next_func, NULL); } EXPORT_SYMBOL_GPL(rpc_wake_up_next); /** * rpc_wake_up_locked - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping * */ static void rpc_wake_up_locked(struct rpc_wait_queue *queue) { struct rpc_task *task; for (;;) { task = __rpc_find_next_queued(queue); if (task == NULL) break; rpc_wake_up_task_queue_locked(queue, task); } } /** * rpc_wake_up - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping * * Grabs queue->lock */ void rpc_wake_up(struct rpc_wait_queue *queue) { spin_lock(&queue->lock); rpc_wake_up_locked(queue); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up); /** * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value. * @queue: rpc_wait_queue on which the tasks are sleeping * @status: status value to set */ static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status) { struct rpc_task *task; for (;;) { task = __rpc_find_next_queued(queue); if (task == NULL) break; rpc_wake_up_task_queue_set_status_locked(queue, task, status); } } /** * rpc_wake_up_status - wake up all rpc_tasks and set their status value. 
* @queue: rpc_wait_queue on which the tasks are sleeping * @status: status value to set * * Grabs queue->lock */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { spin_lock(&queue->lock); rpc_wake_up_status_locked(queue, status); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_status); static void __rpc_queue_timer_fn(struct work_struct *work) { struct rpc_wait_queue *queue = container_of(work, struct rpc_wait_queue, timer_list.dwork.work); struct rpc_task *task, *n; unsigned long expires, now, timeo; spin_lock(&queue->lock); expires = now = jiffies; list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) { timeo = task->tk_timeout; if (time_after_eq(now, timeo)) { trace_rpc_task_timeout(task, task->tk_action); task->tk_status = -ETIMEDOUT; rpc_wake_up_task_queue_locked(queue, task); continue; } if (expires == now || time_after(expires, timeo)) expires = timeo; } if (!list_empty(&queue->timer_list.list)) rpc_set_queue_timer(queue, expires); spin_unlock(&queue->lock); } static void __rpc_atrun(struct rpc_task *task) { if (task->tk_status == -ETIMEDOUT) task->tk_status = 0; } /* * Run a task at a later time */ void rpc_delay(struct rpc_task *task, unsigned long delay) { rpc_sleep_on_timeout(&delay_queue, task, __rpc_atrun, jiffies + delay); } EXPORT_SYMBOL_GPL(rpc_delay); /* * Helper to call task->tk_ops->rpc_call_prepare */ void rpc_prepare_task(struct rpc_task *task) { task->tk_ops->rpc_call_prepare(task, task->tk_calldata); } static void rpc_init_task_statistics(struct rpc_task *task) { /* Initialize retry counters */ task->tk_garb_retry = 2; task->tk_cred_retry = 2; /* starting timestamp */ task->tk_start = ktime_get(); } static void rpc_reset_task_statistics(struct rpc_task *task) { task->tk_timeouts = 0; task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_SENT); rpc_init_task_statistics(task); } /* * Helper that calls task->tk_ops->rpc_call_done if it exists */ void rpc_exit_task(struct rpc_task *task) { trace_rpc_task_end(task, task->tk_action); task->tk_action = NULL; if (task->tk_ops->rpc_count_stats) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); if (task->tk_ops->rpc_call_done != NULL) { trace_rpc_task_call_done(task, task->tk_ops->rpc_call_done); task->tk_ops->rpc_call_done(task, task->tk_calldata); if (task->tk_action != NULL) { /* Always release the RPC slot and buffer memory */ xprt_release(task); rpc_reset_task_statistics(task); } } } void rpc_signal_task(struct rpc_task *task) { struct rpc_wait_queue *queue; if (!RPC_IS_ACTIVATED(task)) return; if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) return; trace_rpc_task_signalled(task, task->tk_action); queue = READ_ONCE(task->tk_waitqueue); if (queue) rpc_wake_up_queued_task(queue, task); } void rpc_task_try_cancel(struct rpc_task *task, int error) { struct rpc_wait_queue *queue; if (!rpc_task_set_rpc_status(task, error)) return; queue = READ_ONCE(task->tk_waitqueue); if (queue) rpc_wake_up_queued_task(queue, task); } void rpc_exit(struct rpc_task *task, int status) { task->tk_status = status; task->tk_action = rpc_exit_task; rpc_wake_up_queued_task(task->tk_waitqueue, task); } EXPORT_SYMBOL_GPL(rpc_exit); void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata) { if (ops->rpc_release != NULL) ops->rpc_release(calldata); } static bool xprt_needs_memalloc(struct rpc_xprt *xprt, struct rpc_task *tk) { if (!xprt) return false; if (!atomic_read(&xprt->swapper)) return false; 
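/* Swap traffic is routed over this transport; only the task that currently holds the transport send lock needs access to memory reserves. */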
return test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == tk; } /* * This is the RPC `scheduler' (or rather, the finite state machine). */ static void __rpc_execute(struct rpc_task *task) { struct rpc_wait_queue *queue; int task_is_async = RPC_IS_ASYNC(task); int status = 0; unsigned long pflags = current->flags; WARN_ON_ONCE(RPC_IS_QUEUED(task)); if (RPC_IS_QUEUED(task)) return; for (;;) { void (*do_action)(struct rpc_task *); /* * Perform the next FSM step or a pending callback. * * tk_action may be NULL if the task has been killed. */ do_action = task->tk_action; /* Tasks with an RPC error status should exit */ if (do_action && do_action != rpc_exit_task && (status = READ_ONCE(task->tk_rpc_status)) != 0) { task->tk_status = status; do_action = rpc_exit_task; } /* Callbacks override all actions */ if (task->tk_callback) { do_action = task->tk_callback; task->tk_callback = NULL; } if (!do_action) break; if (RPC_IS_SWAPPER(task) || xprt_needs_memalloc(task->tk_xprt, task)) current->flags |= PF_MEMALLOC; trace_rpc_task_run_action(task, do_action); do_action(task); /* * Lockless check for whether task is sleeping or not. */ if (!RPC_IS_QUEUED(task)) { cond_resched(); continue; } /* * The queue->lock protects against races with * rpc_make_runnable(). * * Note that once we clear RPC_TASK_RUNNING on an asynchronous * rpc_task, rpc_make_runnable() can assign it to a * different workqueue. We therefore cannot assume that the * rpc_task pointer may still be dereferenced. */ queue = task->tk_waitqueue; spin_lock(&queue->lock); if (!RPC_IS_QUEUED(task)) { spin_unlock(&queue->lock); continue; } /* Wake up any task that has an exit status */ if (READ_ONCE(task->tk_rpc_status) != 0) { rpc_wake_up_task_queue_locked(queue, task); spin_unlock(&queue->lock); continue; } rpc_clear_running(task); spin_unlock(&queue->lock); if (task_is_async) goto out; /* sync task: sleep here */ trace_rpc_task_sync_sleep(task, task->tk_action); status = out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_QUEUED, rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE); if (status < 0) { /* * When a sync task receives a signal, it exits with * -ERESTARTSYS. In order to catch any callbacks that * clean up after sleeping on some queue, we don't * break the loop here, but go around once more. */ rpc_signal_task(task); } trace_rpc_task_sync_wake(task, task->tk_action); } /* Release all resources associated with the task */ rpc_release_task(task); out: current_restore_flags(pflags, PF_MEMALLOC); } /* * User-visible entry point to the scheduler. * * This may be called recursively if e.g. an async NFS task updates * the attributes and finds that dirty pages must be flushed. * NOTE: Upon exit of this function the task is guaranteed to be * released. In particular note that tk_release() will have * been called, so your task memory may have been freed. */ void rpc_execute(struct rpc_task *task) { bool is_async = RPC_IS_ASYNC(task); rpc_set_active(task); rpc_make_runnable(rpciod_workqueue, task); if (!is_async) { unsigned int pflags = memalloc_nofs_save(); __rpc_execute(task); memalloc_nofs_restore(pflags); } } static void rpc_async_schedule(struct work_struct *work) { unsigned int pflags = memalloc_nofs_save(); __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); memalloc_nofs_restore(pflags); } /** * rpc_malloc - allocate RPC buffer resources * @task: RPC task * * A single memory region is allocated, which is split between the * RPC call and RPC reply that this task is being used for. 
When * this RPC is retired, the memory is released by calling rpc_free. * * To prevent rpciod from hanging, this allocator never sleeps, * returning -ENOMEM and suppressing warning if the request cannot * be serviced immediately. The caller can arrange to sleep in a * way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a * mempool, ensuring that NFS reads and writes can always proceed, * and that there is good locality of reference for these buffers. */ int rpc_malloc(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpc_buffer *buf; gfp_t gfp = rpc_task_gfp_mask(); size += sizeof(struct rpc_buffer); if (size <= RPC_BUFFER_MAXSIZE) { buf = kmem_cache_alloc(rpc_buffer_slabp, gfp); /* Reach for the mempool if dynamic allocation fails */ if (!buf && RPC_IS_ASYNC(task)) buf = mempool_alloc(rpc_buffer_mempool, GFP_NOWAIT); } else buf = kmalloc(size, gfp); if (!buf) return -ENOMEM; buf->len = size; rqst->rq_buffer = buf->data; rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } /** * rpc_free - free RPC buffer resources allocated via rpc_malloc * @task: RPC task * */ void rpc_free(struct rpc_task *task) { void *buffer = task->tk_rqstp->rq_buffer; size_t size; struct rpc_buffer *buf; buf = container_of(buffer, struct rpc_buffer, data); size = buf->len; if (size <= RPC_BUFFER_MAXSIZE) mempool_free(buf, rpc_buffer_mempool); else kfree(buf); } /* * Creation and deletion of RPC task structures */ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data) { memset(task, 0, sizeof(*task)); atomic_set(&task->tk_count, 1); task->tk_flags = task_setup_data->flags; task->tk_ops = task_setup_data->callback_ops; task->tk_calldata = task_setup_data->callback_data; INIT_LIST_HEAD(&task->tk_task); task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW; task->tk_owner = current->tgid; /* Initialize workqueue for async tasks */ task->tk_workqueue = task_setup_data->workqueue; task->tk_xprt = rpc_task_get_xprt(task_setup_data->rpc_client, xprt_get(task_setup_data->rpc_xprt)); task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred); if (task->tk_ops->rpc_call_prepare != NULL) task->tk_action = rpc_prepare_task; rpc_init_task_statistics(task); } static struct rpc_task *rpc_alloc_task(void) { struct rpc_task *task; task = kmem_cache_alloc(rpc_task_slabp, rpc_task_gfp_mask()); if (task) return task; return mempool_alloc(rpc_task_mempool, GFP_NOWAIT); } /* * Create a new task for the specified client. */ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) { struct rpc_task *task = setup_data->task; unsigned short flags = 0; if (task == NULL) { task = rpc_alloc_task(); if (task == NULL) { rpc_release_calldata(setup_data->callback_ops, setup_data->callback_data); return ERR_PTR(-ENOMEM); } flags = RPC_TASK_DYNAMIC; } rpc_init_task(task, setup_data); task->tk_flags |= flags; return task; } /* * rpc_free_task - release rpc task and perform cleanups * * Note that we free up the rpc_task _after_ rpc_release_calldata() * in order to work around a workqueue dependency issue. * * Tejun Heo states: * "Workqueue currently considers two work items to be the same if they're * on the same address and won't execute them concurrently - ie. it * makes a work item which is queued again while being executed wait * for the previous execution to complete. 
* * If a work function frees the work item, and then waits for an event * which should be performed by another work item and *that* work item * recycles the freed work item, it can create a false dependency loop. * There really is no reliable way to detect this short of verifying * every memory free." * */ static void rpc_free_task(struct rpc_task *task) { unsigned short tk_flags = task->tk_flags; put_rpccred(task->tk_op_cred); rpc_release_calldata(task->tk_ops, task->tk_calldata); if (tk_flags & RPC_TASK_DYNAMIC) mempool_free(task, rpc_task_mempool); } static void rpc_async_release(struct work_struct *work) { unsigned int pflags = memalloc_nofs_save(); rpc_free_task(container_of(work, struct rpc_task, u.tk_work)); memalloc_nofs_restore(pflags); } static void rpc_release_resources_task(struct rpc_task *task) { xprt_release(task); if (task->tk_msg.rpc_cred) { if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) put_cred(task->tk_msg.rpc_cred); task->tk_msg.rpc_cred = NULL; } rpc_task_release_client(task); } static void rpc_final_put_task(struct rpc_task *task, struct workqueue_struct *q) { if (q != NULL) { INIT_WORK(&task->u.tk_work, rpc_async_release); queue_work(q, &task->u.tk_work); } else rpc_free_task(task); } static void rpc_do_put_task(struct rpc_task *task, struct workqueue_struct *q) { if (atomic_dec_and_test(&task->tk_count)) { rpc_release_resources_task(task); rpc_final_put_task(task, q); } } void rpc_put_task(struct rpc_task *task) { rpc_do_put_task(task, NULL); } EXPORT_SYMBOL_GPL(rpc_put_task); void rpc_put_task_async(struct rpc_task *task) { rpc_do_put_task(task, task->tk_workqueue); } EXPORT_SYMBOL_GPL(rpc_put_task_async); static void rpc_release_task(struct rpc_task *task) { WARN_ON_ONCE(RPC_IS_QUEUED(task)); rpc_release_resources_task(task); /* * Note: at this point we have been removed from rpc_clnt->cl_tasks, * so it should be safe to use task->tk_count as a test for whether * or not any other processes still hold references to our rpc_task. */ if (atomic_read(&task->tk_count) != 1 + !RPC_IS_ASYNC(task)) { /* Wake up anyone who may be waiting for task completion */ if (!rpc_complete_task(task)) return; } else { if (!atomic_dec_and_test(&task->tk_count)) return; } rpc_final_put_task(task, task->tk_workqueue); } int rpciod_up(void) { return try_module_get(THIS_MODULE) ? 0 : -EINVAL; } void rpciod_down(void) { module_put(THIS_MODULE); } /* * Start up the rpciod workqueue. */ static int rpciod_start(void) { struct workqueue_struct *wq; /* * Create the rpciod thread and wait for it to start. 
*/ wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!wq) goto out_failed; rpciod_workqueue = wq; wq = alloc_workqueue("xprtiod", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!wq) goto free_rpciod; xprtiod_workqueue = wq; return 1; free_rpciod: wq = rpciod_workqueue; rpciod_workqueue = NULL; destroy_workqueue(wq); out_failed: return 0; } static void rpciod_stop(void) { struct workqueue_struct *wq = NULL; if (rpciod_workqueue == NULL) return; wq = rpciod_workqueue; rpciod_workqueue = NULL; destroy_workqueue(wq); wq = xprtiod_workqueue; xprtiod_workqueue = NULL; destroy_workqueue(wq); } void rpc_destroy_mempool(void) { rpciod_stop(); mempool_destroy(rpc_buffer_mempool); mempool_destroy(rpc_task_mempool); kmem_cache_destroy(rpc_task_slabp); kmem_cache_destroy(rpc_buffer_slabp); rpc_destroy_wait_queue(&delay_queue); } int rpc_init_mempool(void) { /* * The following is not strictly a mempool initialisation, * but there is no harm in doing it here */ rpc_init_wait_queue(&delay_queue, "delayq"); if (!rpciod_start()) goto err_nomem; rpc_task_slabp = kmem_cache_create("rpc_tasks", sizeof(struct rpc_task), 0, SLAB_HWCACHE_ALIGN, NULL); if (!rpc_task_slabp) goto err_nomem; rpc_buffer_slabp = kmem_cache_create("rpc_buffers", RPC_BUFFER_MAXSIZE, 0, SLAB_HWCACHE_ALIGN, NULL); if (!rpc_buffer_slabp) goto err_nomem; rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE, rpc_task_slabp); if (!rpc_task_mempool) goto err_nomem; rpc_buffer_mempool = mempool_create_slab_pool(RPC_BUFFER_POOLSIZE, rpc_buffer_slabp); if (!rpc_buffer_mempool) goto err_nomem; return 0; err_nomem: rpc_destroy_mempool(); return -ENOMEM; } |
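rpc_malloc() above never sleeps: it first tries the slab allocator (or kmalloc for oversized requests) with a non-blocking mask and, for async tasks, falls back to a preallocated mempool so rpciod can always make progress under memory pressure. The userspace sketch below shows the same try-fast-then-fall-back-to-a-reserve idea; the reserve is a fixed array rather than a kernel mempool, and all names are made up for the example.

/* Standalone sketch of "try the normal allocator, fall back to a reserve". */
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#define RESERVE_SLOTS 4
#define SLOT_SIZE     2048

static unsigned char reserve_mem[RESERVE_SLOTS][SLOT_SIZE];
static bool reserve_used[RESERVE_SLOTS];

static void *reserve_alloc(size_t size)
{
	if (size > SLOT_SIZE)
		return NULL;
	for (int i = 0; i < RESERVE_SLOTS; i++) {
		if (!reserve_used[i]) {
			reserve_used[i] = true;
			return reserve_mem[i];
		}
	}
	return NULL;
}

static void reserve_or_free(void *p)
{
	for (int i = 0; i < RESERVE_SLOTS; i++) {
		if (p == reserve_mem[i]) {
			reserve_used[i] = false;
			return;
		}
	}
	free(p);                         /* not from the reserve: normal path */
}

/* Mirrors the shape of rpc_malloc(): normal allocation first, reserve last. */
static void *buf_alloc(size_t size)
{
	void *p = malloc(size);          /* the "kmem_cache_alloc" stand-in   */

	if (!p)
		p = reserve_alloc(size); /* the "mempool_alloc" stand-in      */
	return p;
}

int main(void)
{
	void *p = buf_alloc(512);

	if (p)
		memset(p, 0, 512);
	reserve_or_free(p);
	return 0;
}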
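rpc_do_put_task() above drops a reference with atomic_dec_and_test(), and only the caller that drops the last reference releases the task's resources and frees it (optionally deferring the free to a workqueue via rpc_async_release). Below is a small userspace sketch of that last-reference-performs-the-cleanup pattern using C11 atomics; struct demo_ref and its helpers are invented for the illustration and say nothing about the kernel's atomic_t internals.

/* Standalone sketch of "the last reference performs the cleanup". */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_ref {
	atomic_int count;
	char *payload;
};

static struct demo_ref *demo_ref_new(void)
{
	struct demo_ref *r = calloc(1, sizeof(*r));

	if (!r)
		return NULL;
	atomic_init(&r->count, 1);
	r->payload = malloc(64);
	return r;
}

static void demo_ref_get(struct demo_ref *r)
{
	atomic_fetch_add(&r->count, 1);
}

/* Like rpc_do_put_task(): only the final put tears the object down. */
static void demo_ref_put(struct demo_ref *r)
{
	if (atomic_fetch_sub(&r->count, 1) == 1) {
		free(r->payload);   /* release resources, then the object */
		free(r);
		printf("released\n");
	}
}

int main(void)
{
	struct demo_ref *r = demo_ref_new();

	if (!r)
		return 1;
	demo_ref_get(r);            /* a second user takes a reference */
	demo_ref_put(r);            /* first put: object survives      */
	demo_ref_put(r);            /* last put: cleanup runs          */
	return 0;
}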
// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/objtool.h> #include <linux/percpu.h> #include
<asm/debugreg.h> #include <asm/mmu_context.h> #include <asm/msr.h> #include "x86.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" #include "pmu.h" #include "posted_intr.h" #include "sgx.h" #include "trace.h" #include "vmx.h" #include "smm.h" #include "x86_ops.h" static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __ro_after_init warn_on_missed_cc; module_param(warn_on_missed_cc, bool, 0444); #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. */ #define VMX_VPID_EXTENT_SUPPORTED_MASK \ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 enum { VMX_VMREAD_BITMAP, VMX_VMWRITE_BITMAP, VMX_BITMAP_NR }; static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) struct shadow_vmcs_field { u16 encoding; u16 offset; }; static struct shadow_vmcs_field shadow_read_only_fields[] = { #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, #include "vmcs_shadow_fields.h" }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static struct shadow_vmcs_field shadow_read_write_fields[] = { #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, #include "vmcs_shadow_fields.h" }; static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); static void init_vmcs_shadow_fields(void) { int i, j; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); for (i = j = 0; i < max_shadow_read_only_fields; i++) { struct shadow_vmcs_field entry = shadow_read_only_fields[i]; u16 field = entry.encoding; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_only_fields || shadow_read_only_fields[i + 1].encoding != field + 1)) pr_err("Missing field from shadow_read_only_field %x\n", field + 1); clear_bit(field, vmx_vmread_bitmap); if (field & 1) #ifdef CONFIG_X86_64 continue; #else entry.offset += sizeof(u32); #endif shadow_read_only_fields[j++] = entry; } max_shadow_read_only_fields = j; for (i = j = 0; i < max_shadow_read_write_fields; i++) { struct shadow_vmcs_field entry = shadow_read_write_fields[i]; u16 field = entry.encoding; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_write_fields || shadow_read_write_fields[i + 1].encoding != field + 1)) pr_err("Missing field from shadow_read_write_field %x\n", field + 1); WARN_ONCE(field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES, "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); /* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist * on bare metal. 
*/ switch (field) { case GUEST_PML_INDEX: if (!cpu_has_vmx_pml()) continue; break; case VMX_PREEMPTION_TIMER_VALUE: if (!cpu_has_vmx_preemption_timer()) continue; break; case GUEST_INTR_STATUS: if (!cpu_has_vmx_apicv()) continue; break; default: break; } clear_bit(field, vmx_vmwrite_bitmap); clear_bit(field, vmx_vmread_bitmap); if (field & 1) #ifdef CONFIG_X86_64 continue; #else entry.offset += sizeof(u32); #endif shadow_read_write_fields[j++] = entry; } max_shadow_read_write_fields = j; } /* * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), * set the success or error code of an emulated VMX instruction (as specified * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated * instruction. */ static int nested_vmx_succeed(struct kvm_vcpu *vcpu) { vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) { vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)) | X86_EFLAGS_CF); return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_failValid(struct kvm_vcpu *vcpu, u32 vm_instruction_error) { vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | X86_EFLAGS_OF)) | X86_EFLAGS_ZF); get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; /* * We don't need to force sync to shadow VMCS because * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all * fields and thus must be synced. */ if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS. */ if (vmx->nested.current_vmptr == INVALID_GPA && !nested_vmx_is_evmptr12_valid(vmx)) return nested_vmx_failInvalid(vcpu); return nested_vmx_failValid(vcpu, vm_instruction_error); } static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. 
*/ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) { return fixed_bits_valid(control, low, high); } static inline u64 vmx_control_msr(u32 low, u32 high) { return low | ((u64)high << 32); } static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); vmx->nested.need_vmcs12_to_shadow_sync = false; } static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_HYPERV struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; if (hv_vcpu) { hv_vcpu->nested.pa_page_gpa = INVALID_GPA; hv_vcpu->nested.vm_id = 0; hv_vcpu->nested.vp_id = 0; } #endif } static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) { #ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); /* * When Enlightened VMEntry is enabled on the calling CPU we treat * memory area pointer by vmptr as Enlightened VMCS (as there's no good * way to distinguish it from VMCS12) and we must not corrupt it by * writing to the non-existent 'launch_state' field. The area doesn't * have to be the currently active EVMCS on the calling CPU and there's * nothing KVM has to do to transition it from 'active' to 'non-active' * state. It is possible that the area will stay mapped as * vmx->nested.hv_evmcs but this shouldn't be a problem. */ if (!guest_cpu_cap_has_evmcs(vcpu) || !evmptr_is_valid(nested_get_evmptr(vcpu))) return false; if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) nested_release_evmcs(vcpu); return true; #else return false; #endif } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, struct loaded_vmcs *prev) { struct vmcs_host_state *dest, *src; if (unlikely(!vmx->vt.guest_state_loaded)) return; src = &prev->host_state; dest = &vmx->loaded_vmcs->host_state; vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); dest->ldt_sel = src->ldt_sel; #ifdef CONFIG_X86_64 dest->ds_sel = src->ds_sel; dest->es_sel = src->es_sel; #endif } static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct loaded_vmcs *prev; int cpu; if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) return; cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; /* * All lazily updated registers will be reloaded from VMCS12 on both * vmentry and vmexit. */ vcpu->arch.regs_dirty = 0; } static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); vmx->nested.pi_desc = NULL; } /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. 
*/ static void free_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) vmx_switch_vmcs(vcpu, &vmx->vmcs01); if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; vmx->nested.vmxon_ptr = INVALID_GPA; free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = INVALID_GPA; if (enable_shadow_vmcs) { vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); free_vmcs(vmx->vmcs01.shadow_vmcs); vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; nested_put_vmcs12_pages(vcpu); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); nested_release_evmcs(vcpu); free_loaded_vmcs(&vmx->nested.vmcs02); } /* * Ensure that the current vmcs of the logical processor is the * vmcs01 of the vcpu before calling free_nested(). */ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); vmx_leave_nested(vcpu); vcpu_put(vcpu); } #define EPTP_PA_MASK GENMASK_ULL(51, 12) static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) { return VALID_PAGE(root_hpa) && ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); } static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, gpa_t addr) { unsigned long roots = 0; uint i; struct kvm_mmu_root_info *cached_root; WARN_ON_ONCE(!mmu_is_nested(vcpu)); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { cached_root = &vcpu->arch.mmu->prev_roots[i]; if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, eptp)) roots |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots) kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); } static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; u32 vm_exit_reason; if (vmx->nested.pml_full) { vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; /* * It should be impossible to trigger a nested PML Full VM-Exit * for anything other than an EPT Violation from L2. KVM *can* * trigger nEPT page fault injection in response to an EPT * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT * tables also changed, but KVM should not treat EPT Misconfig * VM-Exits as writes. */ WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); /* * PML Full and EPT Violation VM-Exits both use bit 12 to report * "NMI unblocking due to IRET", i.e. the bit can be propagated * as-is from the original EXIT_QUALIFICATION. */ exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; } else { if (fault->error_code & PFERR_RSVD_MASK) { vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; exit_qualification = 0; } else { exit_qualification = fault->exit_qualification; exit_qualification |= vmx_get_exit_qual(vcpu) & (EPT_VIOLATION_GVA_IS_VALID | EPT_VIOLATION_GVA_TRANSLATED); vm_exit_reason = EXIT_REASON_EPT_VIOLATION; } /* * Although the caller (kvm_inject_emulated_page_fault) would * have already synced the faulting address in the shadow EPT * tables for the current EPTP12, we also need to sync it for * any other cached EPTP02s based on the same EP4TA, since the * TLB associates mappings to the EP4TA rather than the full EPTP. 
*/ nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, fault->address); } nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, nested_ept_ad_enabled(vcpu), nested_ept_get_eptp(vcpu)); } static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; nested_ept_new_eptp(vcpu); vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) { vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, u16 error_code) { bool inequality, bit; bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; inequality = (error_code & vmcs12->page_fault_error_code_mask) != vmcs12->page_fault_error_code_match; return inequality ^ bit; } static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, u32 error_code) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* * Drop bits 31:16 of the error code when performing the #PF mask+match * check. All VMCS fields involved are 32 bits, but Intel CPUs never * set bits 31:16 and VMX disallows setting bits 31:16 in the injected * error code. Including the to-be-dropped bits in the check might * result in an "impossible" or missed exit from L1's perspective. */ if (vector == PF_VECTOR) return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); return (vmcs12->exception_bitmap & (1u << vector)); } static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return 0; if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) return -EINVAL; return 0; } static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) return -EINVAL; return 0; } static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return 0; if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) return -EINVAL; if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4)) return -EINVAL; return 0; } /* * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, * only the "disable intercept" case needs to be handled. 
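 *
 * E.g. if L1's bitmap passes through reads of the TPR MSR (0x808), the
 * corresponding read-intercept bit is cleared in the merged vmcs02 bitmap;
 * x2APIC MSRs that L1 still intercepts are simply left intercepted.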
*/ static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, unsigned long *msr_bitmap_l0, u32 msr, int type) { if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); } static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) { int msr; for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { unsigned word = msr / BITS_PER_LONG; msr_bitmap[word] = ~0; msr_bitmap[word + (0x800 / sizeof(long))] = ~0; } } #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ static inline \ void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ unsigned long *msr_bitmap_l1, \ unsigned long *msr_bitmap_l0, u32 msr) \ { \ if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ else \ vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ } BUILD_NVMX_MSR_INTERCEPT_HELPER(read) BUILD_NVMX_MSR_INTERCEPT_HELPER(write) static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, unsigned long *msr_bitmap_l1, unsigned long *msr_bitmap_l0, u32 msr, int types) { if (types & MSR_TYPE_R) nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, msr_bitmap_l0, msr); if (types & MSR_TYPE_W) nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, msr_bitmap_l0, msr); } /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. */ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return false; /* * MSR bitmap update can be skipped when: * - MSR bitmap for L1 hasn't changed. * - Nested hypervisor (L1) is attempting to launch the same L2 as * before. * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature * and tells KVM (L0) there were no changes in MSR bitmap for L2. */ if (!vmx->nested.force_msr_bitmap_recalc) { struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) return true; } if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen * 4-byte writes on 32-bit systems) up front to enable intercepts for * the x2APIC MSR range and selectively toggle those relevant to L2. */ enable_x2apic_msr_intercepts(msr_bitmap_l0); if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) { /* * L0 need not intercept reads for MSRs between 0x800 * and 0x8ff, it just lets the processor take the value * from the virtual-APIC page; take those 256 bits * directly from the L1 bitmap. 
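 *
 * (Layout reminder: low MSRs 0x0-0x1fff map 1:1 onto the read-low region
 * at the start of the bitmap, with the write-intercept bits 0x800 bytes
 * further in. The x2APIC range 0x800-0x8ff thus occupies read bits
 * 0x800-0x8ff, i.e. with BITS_PER_LONG == 64 the four words starting at
 * index 0x800 / 64 = 32, which is exactly what the loop below copies.)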
*/ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { unsigned word = msr / BITS_PER_LONG; msr_bitmap_l0[word] = msr_bitmap_l1[word]; } } nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W); if (nested_cpu_has_vid(vmcs12)) { nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } /* * Always check vmcs01's bitmap to honor userspace MSR filters and any * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. */ #ifdef CONFIG_X86_64 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_FS_BASE, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_GS_BASE, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); #endif nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_APERF, MSR_TYPE_R); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_MPERF, MSR_TYPE_R); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_U_CET, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_S_CET, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL0_SSP, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL1_SSP, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL2_SSP, MSR_TYPE_RW); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL3_SSP, MSR_TYPE_RW); kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; return true; } static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; if (ghc->gpa != vmcs12->vmcs_link_pointer && kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; if (ghc->gpa != vmcs12->vmcs_link_pointer && kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } /* * In nested virtualization, check if L1 has set * VM_EXIT_ACK_INTR_ON_EXIT */ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) { return get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_ACK_INTR_ON_EXIT; } static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (nested_cpu_has2(vmcs12, 
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) return -EINVAL; else return 0; } static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && !nested_cpu_has_apic_reg_virt(vmcs12) && !nested_cpu_has_vid(vmcs12) && !nested_cpu_has_posted_intr(vmcs12)) return 0; /* * If virtualize x2apic mode is enabled, * virtualize apic access must be disabled. */ if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) return -EINVAL; /* * If virtual interrupt delivery is enabled, * we must exit on external interrupts. */ if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) return -EINVAL; /* * bits 15:8 should be zero in posted_intr_nv, * the descriptor address has been already checked * in nested_get_vmcs12_pages. * * bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && (CC(!nested_cpu_has_vid(vmcs12)) || CC(!nested_exit_intr_ack_set(vcpu)) || CC((vmcs12->posted_intr_nv & 0xff00)) || CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) return -EINVAL; return 0; } static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, vmx->nested.msrs.misc_high); return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; } static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { if (count == 0) return 0; /* * Exceeding the limit results in architecturally _undefined_ behavior, * i.e. KVM is allowed to do literally anything in response to a bad * limit. Immediately generate a consistency check so that code that * consumes the count doesn't need to worry about extreme edge cases. 
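 *
 * For example, nested_vmx_max_atomic_switch_msrs() above evaluates to
 * (vmx_misc_max_msr(misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER (512), so
 * with 0 in the IA32_VMX_MISC[27:25] field the limit is 512 entries and
 * any larger count fails the check below.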
*/ if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) return -EINVAL; if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; return 0; } static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (CC(nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, vmcs12->vm_exit_msr_load_addr)) || CC(nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, vmcs12->vm_exit_msr_store_addr))) return -EINVAL; return 0; } static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (CC(nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, vmcs12->vm_entry_msr_load_addr))) return -EINVAL; return 0; } static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has_pml(vmcs12)) return 0; if (CC(!nested_cpu_has_ept(vmcs12)) || CC(!page_address_valid(vcpu, vmcs12->pml_address))) return -EINVAL; return 0; } static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && !nested_cpu_has_ept(vmcs12))) return -EINVAL; return 0; } static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && !nested_cpu_has_ept(vmcs12))) return -EINVAL; return 0; } static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (!nested_cpu_has_shadow_vmcs(vmcs12)) return 0; if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) return -EINVAL; return 0; } static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { /* x2APIC MSR accesses are not allowed */ if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) return -EINVAL; if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ CC(e->index == MSR_IA32_UCODE_REV)) return -EINVAL; if (CC(e->reserved != 0)) return -EINVAL; return 0; } static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { if (CC(e->index == MSR_FS_BASE) || CC(e->index == MSR_GS_BASE) || CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; } static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; } /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. * * One of the failure modes for MSR load/store is when a list exceeds the * virtual hardware's capacity. To maintain compatibility with hardware inasmuch * as possible, process all valid entries before failing rather than precheck * for a capacity violation. 
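 *
 * E.g. if the third entry of the list is rejected, the first two MSRs are
 * still loaded and the function returns 3 (i + 1), i.e. the 1-based index
 * of the failing entry, matching the architectural reporting for VM-entry
 * failures during MSR loading.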
*/ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u32 i; struct vmx_msr_entry e; u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { if (WARN_ON_ONCE(i >= max_msr_list_size)) goto fail; if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); goto fail; } if (nested_vmx_load_msr_check(vcpu, &e)) { pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); goto fail; } if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); goto fail; } } return 0; fail: /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ return i + 1; } static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * If the L0 hypervisor stored a more accurate value for the TSC that * does not include the time taken for emulation of the L2->L1 * VM-exit in L0, use the more accurate value. */ if (msr_index == MSR_IA32_TSC) { int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, MSR_IA32_TSC); if (i >= 0) { u64 val = vmx->msr_autostore.guest.val[i].value; *data = kvm_read_l1_tsc(vcpu, val); return true; } } if (kvm_emulate_msr_read(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; } return true; } static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, struct vmx_msr_entry *e) { if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(*e), e, 2 * sizeof(u32))) { pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(*e)); return false; } if (nested_vmx_store_msr_check(vcpu, e)) { pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e->index, e->reserved); return false; } return true; } static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u64 data; u32 i; struct vmx_msr_entry e; u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { if (WARN_ON_ONCE(i >= max_msr_list_size)) return -EINVAL; if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) return -EINVAL; if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) return -EINVAL; if (kvm_vcpu_write_guest(vcpu, gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), &data, sizeof(data))) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, data); return -EINVAL; } } return 0; } static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 count = vmcs12->vm_exit_msr_store_count; u64 gpa = vmcs12->vm_exit_msr_store_addr; struct vmx_msr_entry e; u32 i; for (i = 0; i < count; i++) { if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) return false; if (e.index == msr_index) return true; } return false; } static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, u32 msr_index) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msrs *autostore = &vmx->msr_autostore.guest; bool in_vmcs12_store_list; int msr_autostore_slot; bool in_autostore_list; int last; msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); in_autostore_list = msr_autostore_slot >= 0; in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); if (in_vmcs12_store_list && 
!in_autostore_list) { if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by * nested_vmx_get_vmexit_msr_value() by reading KVM's * internal MSR state instead of reading the value from * the vmcs02 VMExit MSR-store area. */ pr_warn_ratelimited( "Not enough msr entries in msr_autostore. Can't add msr %x\n", msr_index); return; } last = autostore->nr++; autostore->val[last].index = msr_index; } else if (!in_vmcs12_store_list && in_autostore_list) { last = --autostore->nr; autostore->val[msr_autostore_slot] = autostore->val[last]; } } /* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to * @entry_failure_code. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, bool reload_pdptrs, enum vm_entry_failure_code *entry_failure_code) { if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } /* * If PAE paging and EPT are both on, CR3 is not used by the CPU and * must not be dereferenced. */ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && CC(!load_pdptrs(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; } vcpu->arch.cr3 = cr3; kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); if (!nested_ept) kvm_mmu_new_pgd(vcpu, cr3); return 0; } /* * Returns if KVM is able to config CPU to tag TLB entries * populated by L2 differently than TLB entries populated * by L1. * * If L0 uses EPT, L1 and L2 run with different EPTP because * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries * are tagged with different EPTP. * * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged * with different VPID (L1 entries are tagged with vmx->vpid * while L2 entries are tagged with vmx->nested.vpid02). */ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); return enable_ept || (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); } static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool is_vmenter) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* Handle pending Hyper-V TLB flush requests */ kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); /* * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the * same VPID as the host, and so architecturally, linear and combined * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This * is required if VPID is disabled in KVM, as a TLB flush (there are no * VPIDs) still occurs from L1's perspective, and KVM may need to * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't * flushed on VPID invalidations, including VM-Enter or VM-Exit with * VPID disabled. As a result, KVM _never_ needs to sync nEPT * entries on VM-Enter because L1 can't rely on VM-Enter to flush * those mappings. 
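 *
 * Summary of the cases handled below: no VPID in vmcs12 =>
 * KVM_REQ_TLB_FLUSH_GUEST; VPID in use and vpid12 changed on VM-Enter =>
 * KVM_REQ_TLB_FLUSH_GUEST; vpid12 unchanged but no unique TLB tag for
 * L2 => KVM_REQ_TLB_FLUSH_CURRENT; otherwise no flush is needed.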
*/ if (!nested_cpu_has_vpid(vmcs12)) { kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return; } /* L2 should never have a VPID if VPID is disabled. */ WARN_ON(!enable_vpid); /* * VPID is enabled and in use by vmcs12. If vpid12 is changing, then * emulate a guest TLB flush as KVM does not track vpid12 history nor * is the VPID incorporated into the MMU context. I.e. KVM must assume * that the new vpid12 has never been used and thus represents a new * guest ASID that cannot have entries in the TLB. */ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return; } /* * If VPID is enabled, used by vmcs12, and vpid12 is not changing but * does not have a unique TLB tag (ASID), i.e. EPT is disabled and * KVM was unable to allocate a VPID for L2, flush the current context * as the effective ASID is common to both L1 and L2. */ if (!nested_has_guest_tlb_tag(vcpu)) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) { superset &= mask; subset &= mask; return (superset | subset) == superset; } static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | VMX_BASIC_INOUT | VMX_BASIC_TRUE_CTLS | VMX_BASIC_NO_HW_ERROR_CODE_CC; const u64 reserved_bits = GENMASK_ULL(63, 57) | GENMASK_ULL(47, 45) | BIT_ULL(31); u64 vmx_basic = vmcs_config.nested.basic; BUILD_BUG_ON(feature_bits & reserved_bits); /* * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has * inverted polarity), the incoming value must not set feature bits or * reserved bits that aren't allowed/supported by KVM. Fields, i.e. * multi-bit values, are explicitly checked below. */ if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) return -EINVAL; /* * KVM does not emulate a version of VMX that constrains physical * addresses of VMX structures (e.g. VMCS) to 32-bits. */ if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EINVAL; if (vmx_basic_vmcs_revision_id(vmx_basic) != vmx_basic_vmcs_revision_id(data)) return -EINVAL; if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) return -EINVAL; vmx->nested.msrs.basic = data; return 0; } static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u32 **low, u32 **high) { switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: *low = &msrs->pinbased_ctls_low; *high = &msrs->pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: *low = &msrs->procbased_ctls_low; *high = &msrs->procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: *low = &msrs->exit_ctls_low; *high = &msrs->exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: *low = &msrs->entry_ctls_low; *high = &msrs->entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *low = &msrs->secondary_ctls_low; *high = &msrs->secondary_ctls_high; break; default: BUG(); } } static int vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) { u32 *lowp, *highp; u64 supported; vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); supported = vmx_control_msr(*lowp, *highp); /* Check must-be-1 bits are still 1. */ if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) return -EINVAL; /* Check must-be-0 bits are still 0.
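 * (In the VMX capability MSR encoding, bits 31:0 advertise the allowed
 * 0-settings -- a 1 there means the control must be 1 -- while bits 63:32
 * advertise the allowed 1-settings. Hence the low-half check above and
 * the high-half check below.)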
*/ if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) return -EINVAL; vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); *lowp = data; *highp = data >> 32; return 0; } static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) { const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_SHUTDOWN | VMX_MISC_ACTIVITY_WAIT_SIPI | VMX_MISC_INTEL_PT | VMX_MISC_RDMSR_IN_SMM | VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_VMXOFF_BLOCK_SMI | VMX_MISC_ZERO_LEN_INS; const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, vmcs_config.nested.misc_high); BUILD_BUG_ON(feature_bits & reserved_bits); /* * The incoming value must not set feature bits or reserved bits that * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are * explicitly checked below. */ if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) return -EINVAL; if ((vmx->nested.msrs.pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) && vmx_misc_preemption_timer_rate(data) != vmx_misc_preemption_timer_rate(vmx_misc)) return -EINVAL; if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) return -EINVAL; if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) return -EINVAL; if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) return -EINVAL; vmx->nested.msrs.misc_low = data; vmx->nested.msrs.misc_high = data >> 32; return 0; } static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, vmcs_config.nested.vpid_caps); /* Every bit is either reserved or a feature bit. */ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) return -EINVAL; vmx->nested.msrs.ept_caps = data; vmx->nested.msrs.vpid_caps = data >> 32; return 0; } static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) { switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: return &msrs->cr0_fixed0; case MSR_IA32_VMX_CR4_FIXED0: return &msrs->cr4_fixed0; default: BUG(); } } static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) { const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); /* * 1 bits (which indicates bits which "must-be-1" during VMX operation) * must be 1 in the restored value. */ if (!is_bitwise_subset(data, *msr, -1ULL)) return -EINVAL; *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; return 0; } /* * Called when userspace is restoring VMX MSRs. * * Returns 0 on success, non-0 otherwise. */ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Don't allow changes to the VMX capability MSRs while the vCPU * is in VMX operation. */ if (vmx->nested.vmxon) return -EBUSY; switch (msr_index) { case MSR_IA32_VMX_BASIC: return vmx_restore_vmx_basic(vmx, data); case MSR_IA32_VMX_PINBASED_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS: case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_ENTRY_CTLS: /* * The "non-true" VMX capability MSRs are generated from the * "true" MSRs, so we do not support restoring them directly. * * If userspace wants to emulate VMX_BASIC[55]=0, userspace * should restore the "true" MSRs with the must-be-1 bits * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND * DEFAULT SETTINGS". 
*/ return -EINVAL; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS2: return vmx_restore_control_msr(vmx, msr_index, data); case MSR_IA32_VMX_MISC: return vmx_restore_vmx_misc(vmx, data); case MSR_IA32_VMX_CR0_FIXED0: case MSR_IA32_VMX_CR4_FIXED0: return vmx_restore_fixed0_msr(vmx, msr_index, data); case MSR_IA32_VMX_CR0_FIXED1: case MSR_IA32_VMX_CR4_FIXED1: /* * These MSRs are generated based on the vCPU's CPUID, so we * do not support restoring them directly. */ return -EINVAL; case MSR_IA32_VMX_EPT_VPID_CAP: return vmx_restore_vmx_ept_vpid_cap(vmx, data); case MSR_IA32_VMX_VMCS_ENUM: vmx->nested.msrs.vmcs_enum = data; return 0; case MSR_IA32_VMX_VMFUNC: if (data & ~vmcs_config.nested.vmfunc_controls) return -EINVAL; vmx->nested.msrs.vmfunc_controls = data; return 0; default: /* * The rest of the VMX capability MSRs do not support restore. */ return -EINVAL; } } /* Returns 0 on success, non-0 otherwise. */ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) { switch (msr_index) { case MSR_IA32_VMX_BASIC: *pdata = msrs->basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( msrs->pinbased_ctls_low, msrs->pinbased_ctls_high); if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( msrs->procbased_ctls_low, msrs->procbased_ctls_high); if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( msrs->exit_ctls_low, msrs->exit_ctls_high); if (msr_index == MSR_IA32_VMX_EXIT_CTLS) *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( msrs->entry_ctls_low, msrs->entry_ctls_high); if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( msrs->misc_low, msrs->misc_high); break; case MSR_IA32_VMX_CR0_FIXED0: *pdata = msrs->cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: *pdata = msrs->cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: *pdata = msrs->cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: *pdata = msrs->cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: *pdata = msrs->vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( msrs->secondary_ctls_low, msrs->secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: *pdata = msrs->ept_caps | ((u64)msrs->vpid_caps << 32); break; case MSR_IA32_VMX_VMFUNC: *pdata = msrs->vmfunc_controls; break; default: return 1; } return 0; } /* * Copy the writable VMCS shadow fields back to the VMCS12, in case they have * been modified by the L1 guest. Note, "writable" in this context means * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" * VM-exit information fields (which are actually writable if the vCPU is * configured to support "VMWRITE to any supported field in the VMCS"). 
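 *
 * E.g. GUEST_RIP is tagged SHADOW_FIELD_RW and is copied back from the
 * shadow VMCS here, whereas VM-exit information fields such as
 * EXIT_QUALIFICATION are tagged SHADOW_FIELD_RO and become guest-writable
 * only when the "VMWRITE to any field" misc capability is exposed to L1.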
*/ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) { struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); struct shadow_vmcs_field field; unsigned long val; int i; if (WARN_ON(!shadow_vmcs)) return; preempt_disable(); vmcs_load(shadow_vmcs); for (i = 0; i < max_shadow_read_write_fields; i++) { field = shadow_read_write_fields[i]; val = __vmcs_readl(field.encoding); vmcs12_write_any(vmcs12, field.encoding, field.offset, val); } vmcs_clear(shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); preempt_enable(); } static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { const struct shadow_vmcs_field *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; const int max_fields[] = { max_shadow_read_write_fields, max_shadow_read_only_fields }; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); struct shadow_vmcs_field field; unsigned long val; int i, q; if (WARN_ON(!shadow_vmcs)) return; vmcs_load(shadow_vmcs); for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; val = vmcs12_read_any(vmcs12, field.encoding, field.offset); __vmcs_writel(field.encoding, val); } } vmcs_clear(shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); } static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) { #ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; hv_vcpu->nested.vm_id = evmcs->hv_vm_id; hv_vcpu->nested.vp_id = evmcs->hv_vp_id; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { vmcs12->guest_rsp = evmcs->guest_rsp; vmcs12->guest_rflags = evmcs->guest_rflags; vmcs12->guest_interruptibility_info = evmcs->guest_interruptibility_info; /* * Not present in struct vmcs12: * vmcs12->guest_ssp = evmcs->guest_ssp; */ } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { vmcs12->cpu_based_vm_exec_control = evmcs->cpu_based_vm_exec_control; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { vmcs12->exception_bitmap = evmcs->exception_bitmap; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { vmcs12->vm_entry_controls = evmcs->vm_entry_controls; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { vmcs12->vm_entry_intr_info_field = evmcs->vm_entry_intr_info_field; vmcs12->vm_entry_exception_error_code = evmcs->vm_entry_exception_error_code; vmcs12->vm_entry_instruction_len = evmcs->vm_entry_instruction_len; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { vmcs12->host_ia32_pat = evmcs->host_ia32_pat; vmcs12->host_ia32_efer = evmcs->host_ia32_efer; vmcs12->host_cr0 = evmcs->host_cr0; vmcs12->host_cr3 = evmcs->host_cr3; vmcs12->host_cr4 = evmcs->host_cr4; vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; vmcs12->host_rip = evmcs->host_rip; vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; vmcs12->host_es_selector = evmcs->host_es_selector; 
vmcs12->host_cs_selector = evmcs->host_cs_selector; vmcs12->host_ss_selector = evmcs->host_ss_selector; vmcs12->host_ds_selector = evmcs->host_ds_selector; vmcs12->host_fs_selector = evmcs->host_fs_selector; vmcs12->host_gs_selector = evmcs->host_gs_selector; vmcs12->host_tr_selector = evmcs->host_tr_selector; vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; /* * Not present in struct vmcs12: * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; * vmcs12->host_ssp = evmcs->host_ssp; * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; */ } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { vmcs12->pin_based_vm_exec_control = evmcs->pin_based_vm_exec_control; vmcs12->vm_exit_controls = evmcs->vm_exit_controls; vmcs12->secondary_vm_exec_control = evmcs->secondary_vm_exec_control; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { vmcs12->io_bitmap_a = evmcs->io_bitmap_a; vmcs12->io_bitmap_b = evmcs->io_bitmap_b; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { vmcs12->msr_bitmap = evmcs->msr_bitmap; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { vmcs12->guest_es_base = evmcs->guest_es_base; vmcs12->guest_cs_base = evmcs->guest_cs_base; vmcs12->guest_ss_base = evmcs->guest_ss_base; vmcs12->guest_ds_base = evmcs->guest_ds_base; vmcs12->guest_fs_base = evmcs->guest_fs_base; vmcs12->guest_gs_base = evmcs->guest_gs_base; vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; vmcs12->guest_tr_base = evmcs->guest_tr_base; vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; vmcs12->guest_idtr_base = evmcs->guest_idtr_base; vmcs12->guest_es_limit = evmcs->guest_es_limit; vmcs12->guest_cs_limit = evmcs->guest_cs_limit; vmcs12->guest_ss_limit = evmcs->guest_ss_limit; vmcs12->guest_ds_limit = evmcs->guest_ds_limit; vmcs12->guest_fs_limit = evmcs->guest_fs_limit; vmcs12->guest_gs_limit = evmcs->guest_gs_limit; vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; vmcs12->guest_tr_limit = evmcs->guest_tr_limit; vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; vmcs12->guest_es_selector = evmcs->guest_es_selector; vmcs12->guest_cs_selector = evmcs->guest_cs_selector; vmcs12->guest_ss_selector = evmcs->guest_ss_selector; vmcs12->guest_ds_selector = evmcs->guest_ds_selector; vmcs12->guest_fs_selector = evmcs->guest_fs_selector; vmcs12->guest_gs_selector = evmcs->guest_gs_selector; vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; vmcs12->guest_tr_selector = evmcs->guest_tr_selector; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { vmcs12->tsc_offset = evmcs->tsc_offset; vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; vmcs12->tsc_multiplier = evmcs->tsc_multiplier; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 
vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; vmcs12->guest_cr0 = evmcs->guest_cr0; vmcs12->guest_cr3 = evmcs->guest_cr3; vmcs12->guest_cr4 = evmcs->guest_cr4; vmcs12->guest_dr7 = evmcs->guest_dr7; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { vmcs12->host_fs_base = evmcs->host_fs_base; vmcs12->host_gs_base = evmcs->host_gs_base; vmcs12->host_tr_base = evmcs->host_tr_base; vmcs12->host_gdtr_base = evmcs->host_gdtr_base; vmcs12->host_idtr_base = evmcs->host_idtr_base; vmcs12->host_rsp = evmcs->host_rsp; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { vmcs12->ept_pointer = evmcs->ept_pointer; vmcs12->virtual_processor_id = evmcs->virtual_processor_id; } if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; vmcs12->guest_pending_dbg_exceptions = evmcs->guest_pending_dbg_exceptions; vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; vmcs12->guest_activity_state = evmcs->guest_activity_state; vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; /* * Not present in struct vmcs12: * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; */ } /* * Not used? 
* vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; * vmcs12->page_fault_error_code_mask = * evmcs->page_fault_error_code_mask; * vmcs12->page_fault_error_code_match = * evmcs->page_fault_error_code_match; * vmcs12->cr3_target_count = evmcs->cr3_target_count; * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; */ /* * Read only fields: * vmcs12->guest_physical_address = evmcs->guest_physical_address; * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; * vmcs12->exit_qualification = evmcs->exit_qualification; * vmcs12->guest_linear_address = evmcs->guest_linear_address; * * Not present in struct vmcs12: * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; */ return; #else /* CONFIG_KVM_HYPERV */ KVM_BUG_ON(1, vmx->vcpu.kvm); #endif /* CONFIG_KVM_HYPERV */ } static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) { #ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); /* * Should not be changed by KVM: * * evmcs->host_es_selector = vmcs12->host_es_selector; * evmcs->host_cs_selector = vmcs12->host_cs_selector; * evmcs->host_ss_selector = vmcs12->host_ss_selector; * evmcs->host_ds_selector = vmcs12->host_ds_selector; * evmcs->host_fs_selector = vmcs12->host_fs_selector; * evmcs->host_gs_selector = vmcs12->host_gs_selector; * evmcs->host_tr_selector = vmcs12->host_tr_selector; * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; * evmcs->host_cr0 = vmcs12->host_cr0; * evmcs->host_cr3 = vmcs12->host_cr3; * evmcs->host_cr4 = vmcs12->host_cr4; * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; * evmcs->host_rip = vmcs12->host_rip; * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; * evmcs->host_fs_base = vmcs12->host_fs_base; * evmcs->host_gs_base = vmcs12->host_gs_base; * evmcs->host_tr_base = vmcs12->host_tr_base; * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; * evmcs->host_idtr_base = vmcs12->host_idtr_base; * evmcs->host_rsp = vmcs12->host_rsp; * sync_vmcs02_to_vmcs12() doesn't read these: * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; * evmcs->msr_bitmap = vmcs12->msr_bitmap; * evmcs->ept_pointer = vmcs12->ept_pointer; * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; * 
evmcs->tpr_threshold = vmcs12->tpr_threshold; * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; * evmcs->exception_bitmap = vmcs12->exception_bitmap; * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; * evmcs->page_fault_error_code_mask = * vmcs12->page_fault_error_code_mask; * evmcs->page_fault_error_code_match = * vmcs12->page_fault_error_code_match; * evmcs->cr3_target_count = vmcs12->cr3_target_count; * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; * evmcs->tsc_offset = vmcs12->tsc_offset; * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; * * Not present in struct vmcs12: * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; * evmcs->host_ssp = vmcs12->host_ssp; * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; * evmcs->guest_ssp = vmcs12->guest_ssp; */ evmcs->guest_es_selector = vmcs12->guest_es_selector; evmcs->guest_cs_selector = vmcs12->guest_cs_selector; evmcs->guest_ss_selector = vmcs12->guest_ss_selector; evmcs->guest_ds_selector = vmcs12->guest_ds_selector; evmcs->guest_fs_selector = vmcs12->guest_fs_selector; evmcs->guest_gs_selector = vmcs12->guest_gs_selector; evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; evmcs->guest_tr_selector = vmcs12->guest_tr_selector; evmcs->guest_es_limit = vmcs12->guest_es_limit; evmcs->guest_cs_limit = vmcs12->guest_cs_limit; evmcs->guest_ss_limit = vmcs12->guest_ss_limit; evmcs->guest_ds_limit = vmcs12->guest_ds_limit; evmcs->guest_fs_limit = vmcs12->guest_fs_limit; evmcs->guest_gs_limit = vmcs12->guest_gs_limit; evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; evmcs->guest_tr_limit = vmcs12->guest_tr_limit; evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 
evmcs->guest_es_base = vmcs12->guest_es_base; evmcs->guest_cs_base = vmcs12->guest_cs_base; evmcs->guest_ss_base = vmcs12->guest_ss_base; evmcs->guest_ds_base = vmcs12->guest_ds_base; evmcs->guest_fs_base = vmcs12->guest_fs_base; evmcs->guest_gs_base = vmcs12->guest_gs_base; evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; evmcs->guest_tr_base = vmcs12->guest_tr_base; evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; evmcs->guest_idtr_base = vmcs12->guest_idtr_base; evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; evmcs->guest_pending_dbg_exceptions = vmcs12->guest_pending_dbg_exceptions; evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; evmcs->guest_activity_state = vmcs12->guest_activity_state; evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; evmcs->guest_cr0 = vmcs12->guest_cr0; evmcs->guest_cr3 = vmcs12->guest_cr3; evmcs->guest_cr4 = vmcs12->guest_cr4; evmcs->guest_dr7 = vmcs12->guest_dr7; evmcs->guest_physical_address = vmcs12->guest_physical_address; evmcs->vm_instruction_error = vmcs12->vm_instruction_error; evmcs->vm_exit_reason = vmcs12->vm_exit_reason; evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; evmcs->exit_qualification = vmcs12->exit_qualification; evmcs->guest_linear_address = vmcs12->guest_linear_address; evmcs->guest_rsp = vmcs12->guest_rsp; evmcs->guest_rflags = vmcs12->guest_rflags; evmcs->guest_interruptibility_info = vmcs12->guest_interruptibility_info; evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; evmcs->vm_entry_controls = vmcs12->vm_entry_controls; evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; evmcs->vm_entry_exception_error_code = vmcs12->vm_entry_exception_error_code; evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; evmcs->guest_rip = vmcs12->guest_rip; evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; return; #else /* CONFIG_KVM_HYPERV */ KVM_BUG_ON(1, vmx->vcpu.kvm); #endif /* CONFIG_KVM_HYPERV */ } /* * This is an equivalent of the nested hypervisor executing the vmptrld * instruction. */ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( struct kvm_vcpu *vcpu, bool from_launch) { #ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); bool evmcs_gpa_changed = false; u64 evmcs_gpa; if (likely(!guest_cpu_cap_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; evmcs_gpa = nested_get_evmptr(vcpu); if (!evmptr_is_valid(evmcs_gpa)) { nested_release_evmcs(vcpu); return EVMPTRLD_DISABLED; } if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { vmx->nested.current_vmptr = INVALID_GPA; nested_release_evmcs(vcpu); if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), &vmx->nested.hv_evmcs_map)) return EVMPTRLD_ERROR; vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; /* * Currently, KVM only supports eVMCS version 1 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this * value to first u32 field of eVMCS which should specify eVMCS * VersionNumber. 
* * Guest should be aware of supported eVMCS versions by host by * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is * expected to set this CPUID leaf according to the value * returned in vmcs_version from nested_enable_evmcs(). * * However, it turns out that Microsoft Hyper-V fails to comply * to their own invented interface: When Hyper-V use eVMCS, it * just sets first u32 field of eVMCS to revision_id specified * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number * which is one of the supported versions specified in * CPUID.0x4000000A.EAX[0:15]. * * To overcome Hyper-V bug, we accept here either a supported * eVMCS version or VMCS12 revision_id as valid values for first * u32 field of eVMCS. */ if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { nested_release_evmcs(vcpu); return EVMPTRLD_VMFAIL; } vmx->nested.hv_evmcs_vmptr = evmcs_gpa; evmcs_gpa_changed = true; /* * Unlike normal vmcs12, enlightened vmcs12 is not fully * reloaded from guest's memory (read only fields, fields not * present in struct hv_enlightened_vmcs, ...). Make sure there * are no leftovers. */ if (from_launch) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); memset(vmcs12, 0, sizeof(*vmcs12)); vmcs12->hdr.revision_id = VMCS12_REVISION; } } /* * Clean fields data can't be used on VMLAUNCH and when we switch * between different L2 guests as KVM keeps a single VMCS12 per L1. */ if (from_launch || evmcs_gpa_changed) { vmx->nested.hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; vmx->nested.force_msr_bitmap_recalc = true; } return EVMPTRLD_SUCCEEDED; #else return EVMPTRLD_DISABLED; #endif } void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (nested_vmx_is_evmptr12_valid(vmx)) copy_vmcs12_to_enlightened(vmx); else copy_vmcs12_to_shadow(vmx); vmx->nested.need_vmcs12_to_shadow_sync = false; } static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) { struct vcpu_vmx *vmx = container_of(timer, struct vcpu_vmx, nested.preemption_timer); vmx->nested.preemption_timer_expired = true; kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); kvm_vcpu_kick(&vmx->vcpu); return HRTIMER_NORESTART; } static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; if (!vmx->nested.has_preemption_timer_deadline) { vmx->nested.preemption_timer_deadline = vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; vmx->nested.has_preemption_timer_deadline = true; } return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; } static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, u64 preemption_timeout) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * A timer value of zero is architecturally guaranteed to cause * a VMExit prior to executing any instructions in the guest. 
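 *
 * KVM emulates the zero case below by invoking the hrtimer callback
 * directly instead of arming the timer. For non-zero values, one timer
 * tick corresponds to 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC
 * cycles, hence the shift and the conversion to nanoseconds via
 * virtual_tsc_khz below.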
*/ if (preemption_timeout == 0) { vmx_preemption_timer_fn(&vmx->nested.preemption_timer); return; } if (vcpu->arch.virtual_tsc_khz == 0) return; preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); hrtimer_start(&vmx->nested.preemption_timer, ktime_add_ns(ktime_get(), preemption_timeout), HRTIMER_MODE_ABS_PINNED); } static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) return vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); else return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); } static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { struct kvm *kvm = vmx->vcpu.kvm; /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host * fields that come from L0 and are not constant, e.g. HOST_CR3, * will be set as needed prior to VMLAUNCH/VMRESUME. */ if (vmx->nested.vmcs02_initialized) return; vmx->nested.vmcs02_initialized = true; if (vmx->ve_info) vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); if (cpu_has_vmx_posted_intr()) vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); /* * PML is emulated for L2, but never enabled in hardware as the MMU * handles A/D emulation. Disabling PML for L2 also avoids having to * deal with filtering out L2 GPAs from the buffer. */ if (enable_pml) { vmcs_write64(PML_ADDRESS, 0); vmcs_write16(GUEST_PML_INDEX, -1); } if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); if (kvm_notify_vmexit_enabled(kvm)) vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); /* * Set the MSR load/store lists to match L0's settings. Only the * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode. */ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); vmx_set_constant_host_state(vmx); } static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { prepare_vmcs02_constant_state(vmx); vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the * same VPID as the host. Emulate this behavior by using vpid01 for L2 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter * and VM-Exit are architecturally required to flush VPID=0, but *only* * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the * required flushes), but doing so would cause KVM to over-flush. E.g. * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, * and then runs L2 X again, then KVM can and should retain TLB entries * for VPID12=1. 
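 *
 * I.e. the code below programs vmcs02 with vpid02 only when vmcs12
 * enables VPID and a vpid02 was successfully allocated; in every other
 * case L2 runs with vpid01 (vmx->vpid), including the VPID-disabled case
 * described above.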
*/ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); else vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); } } static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, struct vmcs12 *vmcs12) { u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) prepare_vmcs02_early_rare(vmx, vmcs12); /* * PIN CONTROLS */ exec_control = __pin_controls_get(vmcs01); exec_control |= (vmcs12->pin_based_vm_exec_control & ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ vmx->nested.pi_pending = false; if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; } else { vmx->nested.posted_intr_nv = -1; exec_control &= ~PIN_BASED_POSTED_INTR; } pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS */ exec_control = __exec_controls_get(vmcs01); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; vmx->nested.l1_tpr_threshold = -1; if (exec_control & CPU_BASED_TPR_SHADOW) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); #ifdef CONFIG_X86_64 else exec_control |= CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING; #endif /* * A vmexit (to either L1 hypervisor or L0 userspace) is always needed * for I/O port accesses. */ exec_control |= CPU_BASED_UNCOND_IO_EXITING; exec_control &= ~CPU_BASED_USE_IO_BITMAPS; /* * This bit will be computed in nested_get_vmcs12_pages, because * we do not have access to L1's MSR bitmap yet. For now, keep * the same bit as before, hoping to avoid multiple VMWRITEs that * only set/clear this bit. */ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; exec_controls_set(vmx, exec_control); /* * SECONDARY EXEC CONTROLS */ if (cpu_has_secondary_exec_ctrls()) { exec_control = __secondary_exec_controls_get(vmcs01); /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_DESC); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; /* PML is emulated and never enabled in hardware for L2. */ exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* VMCS shadowing for L2 is emulated for now */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; /* * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() * will not have to rewrite the controls just for this bit. 
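* I.e. set SECONDARY_EXEC_DESC up front when UMIP is emulated and L2 is being entered with CR4.UMIP=1.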
*/ if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) exec_control |= SECONDARY_EXEC_DESC; if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); secondary_exec_controls_set(vmx, exec_control); } /* * ENTRY CONTROLS * * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). * * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to * do the same for L2. */ exec_control = __vm_entry_controls_get(vmcs01); exec_control |= (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; if (guest_efer != kvm_host.efer) exec_control |= VM_ENTRY_LOAD_IA32_EFER; } vm_entry_controls_set(vmx, exec_control); /* * EXIT CONTROLS * * L2->L1 exit controls are emulated - the hardware exit is to L0 so * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; else exec_control &= ~VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_set(vmx, exec_control); /* * Interrupt/Exception Fields */ if (vmx->nested.nested_run_pending) { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, vmcs12->vm_entry_exception_error_code); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs12->vm_entry_instruction_len); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); vmx->loaded_vmcs->nmi_known_unmasked = !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } } static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, u64 *ssp, u64 *ssp_tbl) { if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) *s_cet = vmcs_readl(GUEST_S_CET); if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { *ssp = vmcs_readl(GUEST_SSP); *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); } } static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, u64 ssp, u64 ssp_tbl) { if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) vmcs_writel(GUEST_S_CET, s_cet); if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { vmcs_writel(GUEST_SSP, ssp); vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); } } static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 
vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); vmx_segment_cache_clear(vmx); } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); /* * L1 may access the L2's PDPTR, so save them to construct * vmcs12 */ if (enable_ept) { vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); } if (kvm_mpx_supported() && vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); } if (nested_cpu_has_xsaves(vmcs12)) vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); /* * Whether page-faults are trapped is determined by a combination of * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 * doesn't care about page faults then we should set all of these to * L1's desires. However, if L0 does care about (some) page faults, it * is not easy (if at all possible?) to merge L0 and L1's desires, we * simply ask to exit on each and every L2 page fault. This is done by * setting MASK=MATCH=0 and (see below) EB.PF=1. * Note that below we don't need special code to set EB.PF beyond the * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1. */ if (vmx_need_pf_intercept(&vmx->vcpu)) { /* * TODO: if both L0 and L1 need the same MASK and MATCH, * go ahead and use it? 
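* For now, MASK=MATCH=0 below forces an exit on every L2 page fault so that L0 can do its own filtering.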
*/ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); } else { vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); } if (cpu_has_vmx_apicv()) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); } /* * Make sure the msr_autostore list is up to date before we set the * count in the vmcs02. */ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); set_cr4_guest_host_mask(vmx); } /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 * guest in a way that will both be appropriate to L1's requests, and our * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various * vcpu->arch fields. * Returns 0 on success, 1 on failure. Invalid state exit qualification code * is assigned to entry_failure_code on failure. */ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry, enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); bool load_guest_pdptrs_vmcs12 = false; if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & vmx_get_supported_debugctl(vcpu, false)); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, vmx->nested.pre_vmenter_ssp, vmx->nested.pre_vmenter_ssp_tbl); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. 
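* In effect, a CR0 bit stays guest-owned for L2 only if both L0 and L1 leave it untrapped.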
*/ vmx_update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat); } vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( vcpu->arch.l1_tsc_offset, vmx_get_l2_tsc_offset(vcpu), vmx_get_l2_tsc_multiplier(vcpu)); vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( vcpu->arch.l1_tsc_scaling_ratio, vmx_get_l2_tsc_multiplier(vcpu)); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); if (nested_cpu_has_ept(vmcs12)) nested_ept_init_mmu_context(vcpu); /* * Override the CR0/CR4 read shadows after setting the effective guest * CR0/CR4. The common helpers also set the shadows, but they don't * account for vmcs12's cr0/4_guest_host_mask. */ vmx_set_cr0(vcpu, vmcs12->guest_cr0); vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); /* * Guest state is invalid and unrestricted guest is disabled, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. * * However, when force loading the guest state (SMM exit or * loading nested state after migration), it is possible to * have invalid guest state now, which will be fixed later by * restoring the L2 register state. */ if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } /* Load vmcs12.GUEST_CR3, using either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), from_vmentry, entry_failure_code)) return -EINVAL; /* * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 * on nested VM-Exit, which can occur without actually running L2 and * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with * vmcs12.GUEST_ACTIVITY_STATE=HLT, in which case KVM will intercept the * transition to HLT instead of running L2. */ if (enable_ept) vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); /* * It was observed that genuine Hyper-V running in L1 doesn't reset * 'hv_clean_fields' by itself; it only sets the corresponding dirty * bits when it changes a field in the eVMCS. Mark all fields as clean * here.
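* That way only the fields Hyper-V actually dirties before the next VM-Enter need to be copied back into vmcs12.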
*/ if (nested_vmx_is_evmptr12_valid(vmx)) evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; return 0; } static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) { if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && nested_cpu_has_virtual_nmis(vmcs12))) return -EINVAL; if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) return -EINVAL; return 0; } static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* Check for memory type validity */ switch (new_eptp & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) return false; break; case VMX_EPTP_MT_WB: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) return false; break; default: return false; } /* Page-walk levels validity. */ switch (new_eptp & VMX_EPTP_PWL_MASK) { case VMX_EPTP_PWL_5: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) return false; break; case VMX_EPTP_PWL_4: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) return false; break; default: return false; } /* Reserved bits should not be set */ if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) return false; } return true; } /* * Checks related to VM-Execution Control Fields */ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, vmx->nested.msrs.pinbased_ctls_low, vmx->nested.msrs.pinbased_ctls_high)) || CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.msrs.procbased_ctls_low, vmx->nested.msrs.procbased_ctls_high))) return -EINVAL; if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, vmx->nested.msrs.secondary_ctls_low, vmx->nested.msrs.secondary_ctls_high))) return -EINVAL; if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || nested_vmx_check_apic_access_controls(vcpu, vmcs12) || nested_vmx_check_apicv_controls(vcpu, vmcs12) || nested_vmx_check_nmi_controls(vmcs12) || nested_vmx_check_pml_controls(vcpu, vmcs12) || nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) return -EINVAL; if (!nested_cpu_has_preemption_timer(vmcs12) && nested_cpu_has_save_preemption_timer(vmcs12)) return -EINVAL; if (nested_cpu_has_ept(vmcs12) && CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) return -EINVAL; if (nested_cpu_has_vmfunc(vmcs12)) { if (CC(vmcs12->vm_function_control & ~vmx->nested.msrs.vmfunc_controls)) return -EINVAL; if (nested_cpu_has_eptp_switching(vmcs12)) { if (CC(!nested_cpu_has_ept(vmcs12)) || CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) return -EINVAL; } } if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && CC(!vmcs12->tsc_multiplier)) return -EINVAL; return 0; } /* * Checks related to VM-Exit Control Fields */ static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, struct vmcs12 
*vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, vmx->nested.msrs.exit_ctls_low, vmx->nested.msrs.exit_ctls_high)) || CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) return -EINVAL; return 0; } /* * Checks related to VM-Entry Control Fields */ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, vmx->nested.msrs.entry_ctls_low, vmx->nested.msrs.entry_ctls_high))) return -EINVAL; /* * From the Intel SDM, volume 3: * Fields relevant to VM-entry event injection must be set properly. * These fields are the VM-entry interruption-information field, the * VM-entry exception error code, and the VM-entry instruction length. */ if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { u32 intr_info = vmcs12->vm_entry_intr_info_field; u8 vector = intr_info & INTR_INFO_VECTOR_MASK; u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; bool urg = nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST); bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; /* VM-entry interruption-info field: interruption type */ if (CC(intr_type == INTR_TYPE_RESERVED) || CC(intr_type == INTR_TYPE_OTHER_EVENT && !nested_cpu_supports_monitor_trap_flag(vcpu))) return -EINVAL; /* VM-entry interruption-info field: vector */ if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) return -EINVAL; /* * Cannot deliver error code in real mode or if the interrupt * type is not hardware exception. For other cases, do the * consistency check only if the vCPU doesn't enumerate * VMX_BASIC_NO_HW_ERROR_CODE_CC. */ if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { if (CC(has_error_code)) return -EINVAL; } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { if (CC(has_error_code != x86_exception_has_error_code(vector))) return -EINVAL; } /* VM-entry exception error code */ if (CC(has_error_code && vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) return -EINVAL; /* VM-entry instruction length */ switch (intr_type) { case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || CC(vmcs12->vm_entry_instruction_len == 0 && CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; } } if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) return -EINVAL; return 0; } static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (nested_check_vm_execution_controls(vcpu, vmcs12) || nested_check_vm_exit_controls(vcpu, vmcs12) || nested_check_vm_entry_controls(vcpu, vmcs12)) return -EINVAL; #ifdef CONFIG_KVM_HYPERV if (guest_cpu_cap_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); #endif return 0; } static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva; u32 vtpr = vapic ? 
(*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0; /* * Don't bother with the consistency checks if KVM isn't configured to * WARN on missed consistency checks, as KVM needs to rely on hardware * to fully detect an illegal vTPR vs. TRP Threshold combination due to * the vTPR being writable by L1 at all times (it's an in-memory value, * not a VMCS field). I.e. even if the check passes now, it might fail * at the actual VM-Enter. * * Keying off the module param also allows treating an invalid vAPIC * mapping as a consistency check failure without increasing the risk * of breaking a "real" VM. */ if (!warn_on_missed_cc) return 0; if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) && nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) && !nested_cpu_has_vid(vmcs12) && !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && (CC(!vapic) || CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0))))) return -EINVAL; return 0; } static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { #ifdef CONFIG_X86_64 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != !!(vcpu->arch.efer & EFER_LMA))) return -EINVAL; #endif return 0; } static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) { /* * Check that the given linear address is canonical after a VM exit * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. */ u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48; return !__is_canonical_address(la, l1_address_bits_on_exit); } static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, u64 ssp, u64 ssp_tbl) { if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) return -EINVAL; return 0; } static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) return -EINVAL; if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), vmcs12->host_ia32_perf_global_ctrl))) return -EINVAL; if (ia32e) { if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) return -EINVAL; } else { if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || CC((vmcs12->host_rip) >> 32)) return -EINVAL; } if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_cs_selector == 0) || CC(vmcs12->host_tr_selector == 0) || CC(vmcs12->host_ss_selector == 0 && 
!ia32e)) return -EINVAL; if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) return -EINVAL; /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, * the values of the LMA and LME bits in the field must each be that of * the host address-space size VM-exit control. */ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) return -EINVAL; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, vmcs12->host_ssp_tbl)) return -EINVAL; /* * IA32_S_CET and SSP must be canonical if the host will * enter 64-bit mode after VM-exit; otherwise, higher * 32-bits must be all 0s. */ if (ia32e) { if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) return -EINVAL; } else { if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) return -EINVAL; } } return 0; } static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; struct vmcs_hdr hdr; if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) return -EINVAL; if (ghc->gpa != vmcs12->vmcs_link_pointer && CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmcs12->vmcs_link_pointer, VMCS12_SIZE))) return -EINVAL; if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, offsetof(struct vmcs12, hdr), sizeof(hdr)))) return -EINVAL; if (CC(hdr.revision_id != VMCS12_REVISION) || CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) return -EINVAL; return 0; } /* * Checks related to Guest Non-register State */ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) { if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) return -EINVAL; return 0; } static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); *entry_failure_code = ENTRY_FAIL_DEFAULT; if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 
&& CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), vmcs12->guest_ia32_perf_global_ctrl))) return -EINVAL; if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) return -EINVAL; if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) return -EINVAL; /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: * - Bits reserved in the IA32_EFER MSR must be 0. * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of * the IA-32e mode guest VM-exit control. It must also be identical * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to * CR0.PG) is 1. */ if (to_vmx(vcpu)->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || CC(((vmcs12->guest_cr0 & X86_CR0_PG) && ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, vmcs12->guest_ssp_tbl)) return -EINVAL; /* * Guest SSP must have 63:N bits identical, rather than * be canonical (i.e., 63:N-1 bits identical), where N is * the CPU's maximum linear-address width. Similar to * is_noncanonical_msr_address(), use the host's * linear-address width. */ if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) return -EINVAL; } if (nested_check_guest_non_reg_state(vmcs12)) return -EINVAL; return 0; } #ifdef CONFIG_KVM_HYPERV static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * hv_evmcs may end up being not mapped after migration (when * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. */ if (guest_cpu_cap_has_evmcs(vcpu) && vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); if (evmptrld_status == EVMPTRLD_VMFAIL || evmptrld_status == EVMPTRLD_ERROR) return false; /* * Post migration VMCS12 always provides the most actual * information, copy it to eVMCS upon entry. */ vmx->nested.need_vmcs12_to_shadow_sync = true; } return true; } #endif static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_host_map *map; if (!vcpu->arch.pdptrs_from_userspace && !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { /* * Reload the guest's PDPTRs since after a migration * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. 
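* Failure to reload them is flagged as a consistency check failure and causes the page setup below to bail.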
*/ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; } if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { map = &vmx->nested.apic_access_page_map; if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); } else { pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return false; } } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { map = &vmx->nested.virtual_apic_map; if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * The processor will never use the TPR shadow, simply * clear the bit from the execution control. Such a * configuration is useless, but it happens in tests. * For any other configuration, failing the vm entry is * _not_ what the processor does but it's basically the * only possibility we have. */ exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); } else { /* * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to * force VM-Entry to fail. */ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); } } if (nested_cpu_has_posted_intr(vmcs12)) { map = &vmx->nested.pi_desc_map; if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { vmx->nested.pi_desc = (struct pi_desc *)(((void *)map->hva) + offset_in_page(vmcs12->posted_intr_desc_addr)); vmcs_write64(POSTED_INTR_DESC_ADDR, pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); } else { /* * Defer the KVM_INTERNAL_EXIT until KVM tries to * access the contents of the VMCS12 posted interrupt * descriptor. (Note that KVM may do this when it * should not, per the architectural specification.) */ vmx->nested.pi_desc = NULL; pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); } } if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); return true; } static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_HYPERV /* * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post * migration. */ if (!nested_get_evmcs_page(vcpu)) { pr_debug_ratelimited("%s: enlightened vmptrld failed\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return false; } #endif if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) return false; return true; } static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t dst; if (WARN_ON_ONCE(!is_guest_mode(vcpu))) return 0; if (WARN_ON_ONCE(vmx->nested.pml_full)) return 1; /* * Check if PML is enabled for the nested guest. Whether eptp bit 6 is * set is already checked as part of A/D emulation. 
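* If L1 didn't enable PML for L2 the GPA is simply dropped, and an out-of-bounds guest PML index is treated as a full buffer.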
*/ vmcs12 = get_vmcs12(vcpu); if (!nested_cpu_has_pml(vmcs12)) return 0; if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { vmx->nested.pml_full = true; return 1; } gpa &= ~0xFFFull; dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, offset_in_page(dst), sizeof(gpa))) return 0; vmcs12->guest_pml_index--; return 0; } /* * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are * slightly different). It also specifies what exception to inject otherwise. * Note that many of these exceptions have priority over VM exits, so they * don't have to be checked again here. */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 0; } return 1; } static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); /* * If from_vmentry is false, this is being called from state restore (either RSM * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. * * Returns: * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode * NVMX_VMENTRY_VMFAIL: Consistency check VMFail * NVMX_VMENTRY_VMEXIT: Consistency check VMExit * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error */ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); enum vm_entry_failure_code entry_failure_code; union vmx_exit_reason exit_reason = { .basic = EXIT_REASON_INVALID_STATE, .failed_vmentry = 1, }; u32 failed_index; trace_kvm_nested_vmenter(kvm_rip_read(vcpu), vmx->nested.current_vmptr, vmcs12->guest_rip, vmcs12->guest_intr_status, vmcs12->vm_entry_intr_info_field, vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, vmcs12->ept_pointer, vmcs12->guest_cr3, KVM_ISA_VMX); kvm_service_local_tlb_flush_requests(vcpu); if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, &vmx->nested.pre_vmenter_ssp, &vmx->nested.pre_vmenter_ssp_tbl); /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but * not KVM, KVM must unwind its software model to the pre-VM-Entry host * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not * L1's "real" CR3, which causes nested_vmx_restore_host_state() to * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the * unwind naturally setting arch.cr3 to the correct value. Smashing * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind, * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be * overwritten with a shadow CR3 prior to re-entering L1. 
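* (No fixup is needed when EPT is enabled, as vmcs01.GUEST_CR3 then already holds L1's real CR3.)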
*/ if (!enable_ept) vmcs_writel(GUEST_CR3, vcpu->arch.cr3); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_KVM_INTERNAL_ERROR; } if (nested_vmx_check_controls_late(vcpu, vmcs12)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_VMFAIL; } if (nested_vmx_check_guest_state(vcpu, vmcs12, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit; } } enter_guest_mode(vcpu); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; } if (from_vmentry) { failed_index = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); if (failed_index) { exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; vmcs12->exit_qualification = failed_index; goto vmentry_fail_vmexit_guest_mode; } } else { /* * The MMU is not initialized to point at the right entities yet and * "get pages" would need to read data from the guest (i.e. we will * need to perform gpa to hpa translation). Request a call * to nested_get_vmcs12_pages before the next VM-entry. The MSRs * have already been set at vmentry time and should not be reset. */ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } /* * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit * unconditionally. Take care to pull data from vmcs01 as appropriate, * e.g. when checking for interrupt windows, as vmcs02 is now loaded. */ if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING)) || kvm_apic_has_pending_init_or_sipi(vcpu) || kvm_apic_has_interrupt(vcpu)) kvm_make_request(KVM_REQ_EVENT, vcpu); /* * Do not start the preemption timer hrtimer until after we know * we are successful, so that only nested_vmx_vmexit needs to cancel * the timer. */ vmx->nested.preemption_timer_expired = false; if (nested_cpu_has_preemption_timer(vmcs12)) { u64 timer_value = vmx_calc_preemption_timer_value(vcpu); vmx_start_preemption_timer(vcpu, timer_value); } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()). */ return NVMX_VMENTRY_SUCCESS; /* * A failed consistency check that leads to a VMExit during L1's * VMEnter to L2 is a variation of a normal VMexit, as explained in * 26.7 "VM-entry failures during or after loading guest state". */ vmentry_fail_vmexit_guest_mode: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); vmentry_fail_vmexit: vmx_switch_vmcs(vcpu, &vmx->vmcs01); if (!from_vmentry) return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason.full; if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; } /* * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 * for running an L2 nested guest. 
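* Returns 1 if emulation should continue in the guest, and 0 if KVM needs to exit to userspace, e.g. after an internal error during the entry.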
*/ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); enum nested_evmptrld_status evmptrld_status; if (!nested_vmx_check_permission(vcpu)) return 1; evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); if (evmptrld_status == EVMPTRLD_ERROR) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } kvm_pmu_branch_retired(vcpu); if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); if (CC(!nested_vmx_is_evmptr12_valid(vmx) && vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); /* * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact * that there *is* a valid VMCS pointer, RFLAGS.CF is set * rather than RFLAGS.ZF, and no error number is stored to the * VM-instruction error field. */ if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); if (nested_vmx_is_evmptr12_valid(vmx)) { struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); /* Enlightened VMCS doesn't have launch state */ vmcs12->launch_state = !launch; } else if (enable_shadow_vmcs) { copy_shadow_to_vmcs12(vmx); } /* * The nested entry process starts with enforcing various prerequisites * on vmcs12 as required by the Intel SDM, and act appropriately when * they fail: As the SDM explains, some conditions should cause the * instruction to fail, while others will cause the instruction to seem * to succeed, but return an EXIT_REASON_INVALID_STATE. * To speed up the normal (success) code path, we should avoid checking * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); if (CC(vmcs12->launch_state == launch)) return nested_vmx_fail(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); if (nested_vmx_check_controls(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); if (nested_vmx_check_address_space_size(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); if (nested_vmx_check_host_state(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); /* * We're finally done with prerequisite checking, and can start with * the nested entry. */ vmx->nested.nested_run_pending = 1; vmx->nested.has_preemption_timer_deadline = false; status = nested_vmx_enter_non_root_mode(vcpu, true); if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ kvm_request_l1tf_flush_l1d(); /* * Must happen outside of nested_vmx_enter_non_root_mode() as it will * also be used as part of restoring nVMX state for * snapshot restore (migration). * * In this flow, it is assumed that vmcs12 cache was * transferred as part of captured nVMX state and should * therefore not be read from guest memory (which may not * exist on destination host yet). */ nested_cache_shadow_vmcs12(vcpu, vmcs12); switch (vmcs12->guest_activity_state) { case GUEST_ACTIVITY_HLT: /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be * awakened by event injection or by an NMI-window VM-exit or * by an interrupt-window VM-exit, halt the vcpu. 
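* Note that an interrupt-window VM-exit can only wake the vCPU if vmcs12.GUEST_RFLAGS.IF is set, hence the RFLAGS check below.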
*/ if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; return kvm_emulate_halt_noskip(vcpu); } break; case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break; } return 1; vmentry_failed: vmx->nested.nested_run_pending = 0; if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) return 0; if (status == NVMX_VMENTRY_VMEXIT) return 1; WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). * This function returns the new value we should put in vmcs12.guest_cr0. * It's not enough to just return the vmcs02 GUEST_CR0. Rather, * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 * didn't trap the bit, because if L1 did, so would L0). * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have * been modified by L2, and L1 knows it. So just leave the old value of * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 * isn't relevant, because if L0 traps this bit it can set it to anything. * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have * changed these bits, and therefore they need to be updated, but L0 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. */ static inline unsigned long vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { return /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | vcpu->arch.cr0_guest_owned_bits)); } static inline unsigned long vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { return /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | vcpu->arch.cr4_guest_owned_bits)); } static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info) { u32 idt_vectoring; unsigned int nr; /* * Per the SDM, VM-Exits due to double and triple faults are never * considered to occur during event delivery, even if the double/triple * fault is the result of an escalating vectoring issue. * * Note, the SDM qualifies the double fault behavior with "The original * event results in a double-fault exception". It's unclear why the * qualification exists since exits due to double fault can occur only * while vectoring a different exception (injected events are never * subject to interception), i.e. there's _always_ an original event. * * The SDM also uses NMI as a confusing example for the "original event * causes the VM exit directly" clause. NMI isn't special in any way, * the same rule applies to all events that cause an exit directly. * NMI is an odd choice for the example because NMIs can only occur on * instruction boundaries, i.e. they _can't_ occur during vectoring. 
*/ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && is_double_fault(exit_intr_info))) { vmcs12->idt_vectoring_info_field = 0; } else if (vcpu->arch.exception.injected) { nr = vcpu->arch.exception.vector; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; if (kvm_exception_is_soft(nr)) { vmcs12->vm_exit_instruction_len = vcpu->arch.event_exit_inst_len; idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; } else idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; if (vcpu->arch.exception.has_error_code) { idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; vmcs12->idt_vectoring_error_code = vcpu->arch.exception.error_code; } vmcs12->idt_vectoring_info_field = idt_vectoring; } else if (vcpu->arch.nmi_injected) { vmcs12->idt_vectoring_info_field = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; } else if (vcpu->arch.interrupt.injected) { nr = vcpu->arch.interrupt.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; if (vcpu->arch.interrupt.soft) { idt_vectoring |= INTR_TYPE_SOFT_INTR; vmcs12->vm_entry_instruction_len = vcpu->arch.event_exit_inst_len; } else idt_vectoring |= INTR_TYPE_EXT_INTR; vmcs12->idt_vectoring_info_field = idt_vectoring; } else { vmcs12->idt_vectoring_info_field = 0; } } void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gfn_t gfn; /* * Don't need to mark the APIC access page dirty; it is never * written to by the CPU during APIC virtualization. */ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; kvm_vcpu_mark_page_dirty(vcpu, gfn); } if (nested_cpu_has_posted_intr(vmcs12)) { gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; kvm_vcpu_mark_page_dirty(vcpu, gfn); } } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; void *vapic_page; u16 status; if (!vmx->nested.pi_pending) return 0; if (!vmx->nested.pi_desc) goto mmio_needed; vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) return 0; max_irr = pi_find_highest_vector(vmx->nested.pi_desc); if (max_irr > 0) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) goto mmio_needed; __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page, &max_irr); status = vmcs_read16(GUEST_INTR_STATUS); if ((u8)max_irr > ((u8)status & 0xff)) { status &= ~0xff; status |= (u8)max_irr; vmcs_write16(GUEST_INTR_STATUS, status); } } nested_mark_vmcs12_pages_dirty(vcpu); return 0; mmio_needed: kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); return -ENXIO; } static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned long exit_qual; if (ex->has_payload) { exit_qual = ex->payload; } else if (ex->vector == PF_VECTOR) { exit_qual = vcpu->arch.cr2; } else if (ex->vector == DB_VECTOR) { exit_qual = vcpu->arch.dr6; exit_qual &= ~DR6_BT; exit_qual ^= DR6_ACTIVE_LOW; } else { exit_qual = 0; } /* * Unlike AMD's Paged Real Mode, which reports an error code on #PF * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the * "has error code" flags on VM-Exit if the CPU is in Real Mode. */ if (ex->has_error_code && is_protmode(vcpu)) { /* * Intel CPUs do not generate error codes with bits 31:16 set, * and more importantly VMX disallows setting bits 31:16 in the * injected error code for VM-Entry. 
Drop the bits to mimic * hardware and avoid inducing failure on nested VM-Entry if L1 * chooses to inject the exception back to L2. AMD CPUs _do_ * generate "full" 32-bit error codes, so KVM allows userspace * to inject exception error codes with bits 31:16 set. */ vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; intr_info |= INTR_INFO_DELIVER_CODE_MASK; } if (kvm_exception_is_soft(ex->vector)) intr_info |= INTR_TYPE_SOFT_EXCEPTION; else intr_info |= INTR_TYPE_HARD_EXCEPTION; if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && vmx_get_nmi_mask(vcpu)) intr_info |= INTR_INFO_UNBLOCK_NMI; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); } /* * Returns true if a debug trap is (likely) pending delivery. Infer the class * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). * Using the payload is flawed because code breakpoints (fault-like) and data * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. * this will return false positives if a to-be-injected code breakpoint #DB is * pending (from KVM's perspective, but not "pending" across an instruction * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it * too is trap-like. * * KVM "works" despite these flaws as ICEBP isn't currently supported by the * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the * #DB has already happened), and MTF isn't marked pending on code breakpoints * from the emulator (because such #DBs are fault-like and thus don't trigger * actions that fire on instruction retire). */ static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) { if (!ex->pending || ex->vector != DB_VECTOR) return 0; /* General Detect #DBs are always fault-like. */ return ex->payload & ~DR6_BD; } /* * Returns true if there's a pending #DB exception that is lower priority than * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by * KVM, but could theoretically be injected by userspace. Note, this code is * imperfect, see above. */ static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) { return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; } /* * Certain VM-exits set the 'pending debug exceptions' field to indicate a * recognized #DB (data or single-step) that has yet to be delivered. Since KVM * represents these debug traps with a payload that is said to be compatible * with the 'pending debug exceptions' field, write the payload to the VMCS * field if a VM-exit is delivered before the debug trap. */ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) { unsigned long pending_dbg; pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); if (pending_dbg) vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); } static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) { return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && to_vmx(vcpu)->nested.preemption_timer_expired; } static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_vmx *vmx = to_vmx(vcpu); void *vapic = vmx->nested.virtual_apic_map.hva; int max_irr, vppr; if (nested_vmx_preemption_timer_pending(vcpu) || vmx->nested.mtf_pending) return true; /* * Virtual Interrupt Delivery doesn't require manual injection. Either * the interrupt is already in GUEST_RVI and will be recognized by CPU * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move * the interrupt from the PIR to RVI prior to entering the guest. 
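* Hence, when called to check for events that require manual injection, virtual interrupts are never reported here.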
*/ if (for_injection) return false; if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || __vmx_interrupt_blocked(vcpu)) return false; if (!vapic) return false; vppr = *((u32 *)(vapic + APIC_PROCPRI)); max_irr = vmx_get_rvi(); if ((max_irr & 0xf0) > (vppr & 0xf0)) return true; if (vmx->nested.pi_pending && vmx->nested.pi_desc && pi_test_on(vmx->nested.pi_desc)) { max_irr = pi_find_highest_vector(vmx->nested.pi_desc); if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) return true; } return false; } /* * Per the Intel SDM's table "Priority Among Concurrent Events", with minor * edits to fill in missing examples, e.g. #DB due to split-lock accesses, * and less minor edits to splice in the priority of VMX Non-Root specific * events, e.g. MTF and NMI/INTR-window exiting. * * 1 Hardware Reset and Machine Checks * - RESET * - Machine Check * * 2 Trap on Task Switch * - T flag in TSS is set (on task switch) * * 3 External Hardware Interventions * - FLUSH * - STOPCLK * - SMI * - INIT * * 3.5 Monitor Trap Flag (MTF) VM-exit[1] * * 4 Traps on Previous Instruction * - Breakpoints * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O * breakpoint, or #DB due to a split-lock access) * * 4.3 VMX-preemption timer expired VM-exit * * 4.6 NMI-window exiting VM-exit[2] * * 5 Nonmaskable Interrupts (NMI) * * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery * * 6 Maskable Hardware Interrupts * * 7 Code Breakpoint Fault * * 8 Faults from Fetching Next Instruction * - Code-Segment Limit Violation * - Code Page Fault * - Control protection exception (missing ENDBRANCH at target of indirect * call or jump) * * 9 Faults from Decoding Next Instruction * - Instruction length > 15 bytes * - Invalid Opcode * - Coprocessor Not Available * *10 Faults on Executing Instruction * - Overflow * - Bound error * - Invalid TSS * - Segment Not Present * - Stack fault * - General Protection * - Data Page Fault * - Alignment Check * - x86 FPU Floating-point exception * - SIMD floating-point exception * - Virtualization exception * - Control protection exception * * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), * INIT signals, and higher priority events take priority over MTF VM exits. * MTF VM exits take priority over debug-trap exceptions and lower priority * events. * * [2] Debug-trap exceptions and higher priority events take priority over VM exits * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption * timer take priority over VM exits caused by the "NMI-window exiting" * VM-execution control and lower priority events. * * [3] Debug-trap exceptions and higher priority events take priority over VM exits * caused by "NMI-window exiting". VM exits caused by this control take * priority over non-maskable interrupts (NMIs) and lower priority events. * * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, * non-maskable interrupts (NMIs) and higher priority events take priority over * delivery of a virtual interrupt; delivery of a virtual interrupt takes * priority over external interrupts and lower priority events. */ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Only a pending nested run blocks a pending exception. If there is a * previously injected event, the pending exception occurred while said * event was being delivered and thus needs to be handled. 
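 * E.g. if delivery of a previously injected event faults on the IDT
 * access, the resulting #PF is a brand new pending exception that must
 * be processed now, either delivered to L2 or morphed into a VM-Exit
 * to L1; it cannot simply be deferred behind the reinjected event.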
*/ bool block_nested_exceptions = vmx->nested.nested_run_pending; /* * Events that don't require injection, i.e. that are virtualized by * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need * to regain control in order to deliver the event, and hardware will * handle event ordering, e.g. with respect to injected exceptions. * * But, new events (not exceptions) are only recognized at instruction * boundaries. If an event needs reinjection, then KVM is handling a * VM-Exit that occurred _during_ instruction execution; new events, * irrespective of whether or not they're injected, are blocked until * the instruction completes. */ bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); /* * Inject events are blocked by nested VM-Enter, as KVM is responsible * for managing priority between concurrent events, i.e. KVM needs to * wait until after VM-Enter completes to deliver injected events. */ bool block_nested_events = block_nested_exceptions || block_non_injected_events; if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; nested_vmx_update_pending_dbg(vcpu); clear_bit(KVM_APIC_INIT, &apic->pending_events); if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); /* MTF is discarded if the vCPU is in WFS. */ vmx->nested.mtf_pending = false; return 0; } if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (block_nested_events) return -EBUSY; clear_bit(KVM_APIC_SIPI, &apic->pending_events); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, apic->sipi_vector & 0xFFUL); return 0; } /* Fallthrough, the SIPI is completely ignored. */ } /* * Process exceptions that are higher priority than Monitor Trap Flag: * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but * could theoretically come in from userspace), and ICEBP (INT1). * * TODO: SMIs have higher priority than MTF and trap-like #DBs (except * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF * across SMI/RSM as it should; that needs to be addressed in order to * prioritize SMI over MTF and trap-like #DBs. 
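 *
 * As implemented, the checks below run in this order: fault-like and
 * other high-priority exceptions (first those that VM-Exit to L1, then
 * those destined for L2), MTF, trap-like #DBs, the VMX-preemption
 * timer, SMI, NMI, and finally external interrupts.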
*/ if (vcpu->arch.exception_vmexit.pending && !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { if (block_nested_exceptions) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu); return 0; } if (vcpu->arch.exception.pending && !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { if (block_nested_exceptions) return -EBUSY; goto no_vmexit; } if (vmx->nested.mtf_pending) { if (block_nested_events) return -EBUSY; nested_vmx_update_pending_dbg(vcpu); nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); return 0; } if (vcpu->arch.exception_vmexit.pending) { if (block_nested_exceptions) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu); return 0; } if (vcpu->arch.exception.pending) { if (block_nested_exceptions) return -EBUSY; goto no_vmexit; } if (nested_vmx_preemption_timer_pending(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); return 0; } if (vcpu->arch.smi_pending && !is_smm(vcpu)) { if (block_nested_events) return -EBUSY; goto no_vmexit; } if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; if (!nested_exit_on_nmi(vcpu)) goto no_vmexit; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK, 0); /* * The NMI-triggered VM exit counts as injection: * clear this one and block further NMIs. */ vcpu->arch.nmi_pending = 0; vmx_set_nmi_mask(vcpu, true); return 0; } if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { int irq; if (!nested_exit_on_intr(vcpu)) { if (block_nested_events) return -EBUSY; goto no_vmexit; } if (!nested_exit_intr_ack_set(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); return 0; } irq = kvm_cpu_get_extint(vcpu); if (irq != -1) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); return 0; } irq = kvm_apic_has_interrupt(vcpu); if (WARN_ON_ONCE(irq < 0)) goto no_vmexit; /* * If the IRQ is L2's PI notification vector, process posted * interrupts for L2 instead of injecting VM-Exit, as the * detection/morphing architecturally occurs when the IRQ is * delivered to the CPU. Note, only interrupts that are routed * through the local APIC trigger posted interrupt processing, * and enabling posted interrupts requires ACK-on-exit. */ if (irq == vmx->nested.posted_intr_nv) { /* * Nested posted interrupts are delivered via RVI, i.e. * aren't injected by KVM, and so can be queued even if * manual event injection is disallowed. */ if (block_non_injected_events) return -EBUSY; vmx->nested.pi_pending = true; kvm_apic_clear_irr(vcpu, irq); goto no_vmexit; } if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); /* * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI * if APICv is active. 
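 *
 * Note, because this path requires L1's "acknowledge interrupt on
 * exit" control, the vector is handed to L1 in bits 7:0 of the VM-Exit
 * interruption-information field (INTR_INFO_VALID_MASK |
 * INTR_TYPE_EXT_INTR | irq above) rather than L1 acknowledging the
 * interrupt itself.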
*/ kvm_apic_ack_interrupt(vcpu, irq); return 0; } no_vmexit: return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) { ktime_t remaining = hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); u64 value; if (ktime_to_ns(remaining) <= 0) return 0; value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; do_div(value, 1000000); return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; } static bool is_vmcs12_ext_field(unsigned long field) { switch (field) { case GUEST_ES_SELECTOR: case GUEST_CS_SELECTOR: case GUEST_SS_SELECTOR: case GUEST_DS_SELECTOR: case GUEST_FS_SELECTOR: case GUEST_GS_SELECTOR: case GUEST_LDTR_SELECTOR: case GUEST_TR_SELECTOR: case GUEST_ES_LIMIT: case GUEST_CS_LIMIT: case GUEST_SS_LIMIT: case GUEST_DS_LIMIT: case GUEST_FS_LIMIT: case GUEST_GS_LIMIT: case GUEST_LDTR_LIMIT: case GUEST_TR_LIMIT: case GUEST_GDTR_LIMIT: case GUEST_IDTR_LIMIT: case GUEST_ES_AR_BYTES: case GUEST_DS_AR_BYTES: case GUEST_FS_AR_BYTES: case GUEST_GS_AR_BYTES: case GUEST_LDTR_AR_BYTES: case GUEST_TR_AR_BYTES: case GUEST_ES_BASE: case GUEST_CS_BASE: case GUEST_SS_BASE: case GUEST_DS_BASE: case GUEST_FS_BASE: case GUEST_GS_BASE: case GUEST_LDTR_BASE: case GUEST_TR_BASE: case GUEST_GDTR_BASE: case GUEST_IDTR_BASE: case GUEST_PENDING_DBG_EXCEPTIONS: case GUEST_BNDCFGS: return true; default: break; } return false; } static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 
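	/*
	 * The fields captured above are the "rare" half of L2's guest state:
	 * they aren't needed to handle the vast majority of nested VM-Exits,
	 * so reading them back is deferred (see copy_vmcs02_to_vmcs12_rare())
	 * until L1 actually consumes vmcs12, saving a few dozen VMREADs per
	 * exit. Clearing need_sync_vmcs02_to_vmcs12_rare below records that
	 * the rare fields are now up-to-date.
	 */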
vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; } static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) return; WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; vmx_vcpu_load_vmcs(vcpu, cpu); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; vmx_vcpu_load_vmcs(vcpu, cpu); put_cpu(); } /* * Update the guest state fields of vmcs12 to reflect changes that * occurred while L2 was running. (The "IA-32e mode guest" bit of the * VM-entry controls is also updated, since this is really a guest * state bit.) */ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (nested_vmx_is_evmptr12_valid(vmx)) sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !nested_vmx_is_evmptr12_valid(vmx); vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); vmcs12->guest_rsp = kvm_rsp_read(vcpu); vmcs12->guest_rip = kvm_rip_read(vcpu); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; if (nested_cpu_has_preemption_timer(vmcs12) && vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && !vmx->nested.nested_run_pending) vmcs12->vmx_preemption_timer_value = vmx_get_preemption_timer_value(vcpu); /* * In some cases (usually, nested EPT), L2 is allowed to change its * own CR3 without exiting. If it has changed it, we must keep it. * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. * * Additionally, restore L2's PDPTR to vmcs12. */ if (enable_ept) { vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } } vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); /* * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. * Writes to DEBUGCTL that aren't intercepted by L1 are immediately * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into * vmcs02 doesn't strictly track vmcs12. */ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) vmcs12->guest_dr7 = vcpu->arch.dr7; if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, &vmcs12->guest_ssp, &vmcs12->guest_ssp_tbl); } /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. 
L1 keeps a vmcs for L2 (vmcs12), * and this function updates it to reflect the changes to the guest state while * L2 was running (and perhaps made some exits which were handled directly by L0 * without going back to L1), and to reflect the exit reason. * Note that we do not have to copy here all VMCS fields, just those that * could have changed by the L2 guest or the exit - i.e., the guest-state and * exit-information fields only. Other fields are modified by L1 with VMWRITE, * which already writes to vmcs12 directly. */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification, u32 exit_insn_len) { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; if (vmx_get_exit_reason(vcpu).enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; /* * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other * exit info fields are unmodified. */ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { vmcs12->launch_state = 1; /* vm_entry_intr_info_field is cleared on exit. Emulate this * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; /* * Transfer the event that L0 or L1 may wanted to inject into * L2 to IDT_VECTORING_INFO_FIELD. */ vmcs12_save_pending_event(vcpu, vmcs12, vm_exit_reason, exit_intr_info); vmcs12->vm_exit_intr_info = exit_intr_info; vmcs12->vm_exit_instruction_len = exit_insn_len; vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* * According to spec, there's no need to store the guest's * MSRs if the exit is due to a VM-entry failure that occurs * during or after loading the guest state. Since this exit * does not fall in that category, we need to save the MSRs. */ if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); } } /* * A part of what we need to when the nested L2 guest exits and we want to * run its L1 parent, is to reset L1's guest state to the host state specified * in vmcs12. * This function is to be called not only on normal nested exit, but also on * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry * Failures During or After Loading Guest State"). * This function should be called when the active VMCS is L1's (vmcs01). */ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { enum vm_entry_failure_code ignored; struct kvm_segment seg; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) vcpu->arch.efer |= (EFER_LMA | EFER_LME); else vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); vmx_set_efer(vcpu, vcpu->arch.efer); kvm_rsp_write(vcpu, vmcs12->host_rsp); kvm_rip_write(vcpu, vmcs12->host_rip); vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); vmx_set_interrupt_shadow(vcpu, 0); /* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't * actually changed, because vmx_set_cr0 refers to efer set above. * * CR0_GUEST_HOST_MASK is already set in the original vmcs01 * (KVM doesn't change it); */ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs12->host_cr0); /* Same as above - no reason to call set_cr4_guest_host_mask(). 
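 * A bit set in CR4_GUEST_HOST_MASK is host-owned: guest (L1) reads of
 * that bit return the read shadow and attempts to change it cause a
 * VM-Exit, so the bits L1 truly owns are simply the complement of the
 * mask, as computed below.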
*/ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); vmx_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); /* * Only PDPTE load can fail as the value of cr3 was checked on entry and * couldn't have changed. */ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, 0); /* * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. * otherwise CET state should be retained across VM-exit, i.e., * guest values should be propagated from vmcs12 to vmcs01. */ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, vmcs12->host_ssp_tbl); else vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, .selector = vmcs12->host_cs_selector, .type = 11, .present = 1, .s = 1, .g = 1 }; if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) seg.l = 1; else seg.db = 1; __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, .type = 3, .present = 1, .s = 1, .db = 1, .g = 1 }; seg.selector = vmcs12->host_ds_selector; __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, .selector = vmcs12->host_tr_selector, .type = 11, .present = 1 }; __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); memset(&seg, 0, sizeof(seg)); seg.unusable = 1; __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmx_guest_debugctl_write(vcpu, 0); if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) { struct vmx_uret_msr *efer_msr; unsigned int i; if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) return vmcs_read64(GUEST_IA32_EFER); if 
(cpu_has_load_ia32_efer()) return kvm_host.efer; for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) return vmx->msr_autoload.guest.val[i].value; } efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); if (efer_msr) return efer_msr->data; return kvm_host.efer; } static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msr_entry g, h; gpa_t gpa; u32 i, j; vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { /* * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set * as vmcs01.GUEST_DR7 contains a userspace defined value * and vcpu->arch.dr7 is not squirreled away before the * nested VMENTER (not worth adding a variable in nested_vmx). */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) kvm_set_dr(vcpu, 7, DR7_FIXED_1); else WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); } /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ vmx_reload_guest_debugctl(vcpu); /* * Note that calling vmx_set_{efer,cr0,cr4} is important as they * handle a variety of side effects to KVM's software model. */ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); nested_ept_uninit_mmu_context(vcpu); vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs * from vmcs01 (if necessary). The PDPTRs are not loaded on * VMFail, like everything else we just need to ensure our * software model is up-to-date. */ if (enable_ept && is_pae_paging(vcpu)) ept_save_pdptrs(vcpu); kvm_mmu_reset_context(vcpu); /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation * of VMFail), leaving the nested VM's MSRs in the software model * (incorrect behavior) and snapshotting the modified MSRs (too * expensive since the lists are unbound by hardware). For each * MSR that was (prematurely) loaded from the nested VMEntry load * list, reload it from the exit load list if it exists and differs * from the guest value. The intent is to stuff host state as * silently as possible, not to fully process the exit load list. 
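 *
 * Each list entry is a 16-byte struct vmx_msr_entry:
 *
 *	struct vmx_msr_entry {
 *		u32 index;
 *		u32 reserved;
 *		u64 value;
 *	};
 *
 * which is why the loops below step through guest memory in sizeof(g) /
 * sizeof(h) sized increments.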
*/ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { pr_debug_ratelimited( "%s read MSR index failed (%u, 0x%08llx)\n", __func__, i, gpa); goto vmabort; } for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { pr_debug_ratelimited( "%s read MSR failed (%u, 0x%08llx)\n", __func__, j, gpa); goto vmabort; } if (h.index != g.index) continue; if (h.value == g.value) break; if (nested_vmx_load_msr_check(vcpu, &h)) { pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, j, h.index, h.reserved); goto vmabort; } if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); goto vmabort; } } } return; vmabort: nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } /* * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification, u32 exit_insn_len) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* Pending MTF traps are discarded on VM-Exit. */ vmx->nested.mtf_pending = false; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); #ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { /* * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map * Enlightened VMCS after migration and we still need to * do that when something is forcing L2->L1 exit prior to * the first L2 run. */ (void)nested_get_evmcs_page(vcpu); } #endif /* Service pending TLB flush requests for L2 before switching to L1. */ kvm_service_local_tlb_flush_requests(vcpu); /* * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are * up-to-date before switching to L1. */ if (enable_ept && is_pae_paging(vcpu)) vmx_ept_load_pdptrs(vcpu); leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; } if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); if (vm_exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, exit_intr_info, exit_qualification, exit_insn_len); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will * also be used to capture vmcs12 cache as part of * capturing nVMX state for snapshot (migration). * * Otherwise, this flush will dirty guest memory at a * point it is already assumed by user-space to be * immutable. */ nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); } else { /* * The only expected VM-instruction error is "VM entry with * invalid control field(s)." Anything else indicates a * problem with L0. */ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); /* VM-Fail at VM-Entry means KVM missed a consistency check. 
*/ WARN_ON_ONCE(warn_on_missed_cc); } /* * Drop events/exceptions that were queued for re-injection to L2 * (picked up via vmx_complete_interrupts()), as well as exceptions * that were pending for L2. Note, this must NOT be hoisted above * prepare_vmcs12(), events/exceptions queued for re-injection need to * be captured in vmcs12 (see vmcs12_save_pending_event()). */ vcpu->arch.nmi_injected = false; kvm_clear_exception_queue(vcpu); kvm_clear_interrupt_queue(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); kvm_nested_vmexit_handle_ibrs(vcpu); /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); if (vmx->nested.l1_tpr_threshold != -1) vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); } if (vmx->nested.update_vmcs01_cpu_dirty_logging) { vmx->nested.update_vmcs01_cpu_dirty_logging = false; vmx_update_cpu_dirty_logging(vcpu); } nested_put_vmcs12_pages(vcpu); if (vmx->nested.reload_vmcs01_apic_access_page) { vmx->nested.reload_vmcs01_apic_access_page = false; kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); } if (vmx->nested.update_vmcs01_apicv_status) { vmx->nested.update_vmcs01_apicv_status = false; vmx_refresh_apicv_exec_ctrl(vcpu); } if (vmx->nested.update_vmcs01_hwapic_isr) { vmx->nested.update_vmcs01_hwapic_isr = false; kvm_apic_update_hwapic_isr(vcpu); } if ((vm_exit_reason != -1) && (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (likely(!vmx->fail)) { if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, vmcs12->idt_vectoring_info_field, vmcs12->vm_exit_intr_info, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); load_vmcs12_host_state(vcpu, vmcs12); /* * Process events if an injectable IRQ or NMI is pending, even * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). * If an event became pending while L2 was active, KVM needs to * either inject the event or request an IRQ/NMI window. SMIs * don't need to be processed as SMM is mutually exclusive with * non-root mode. INIT/SIPI don't need to be checked as INIT * is blocked post-VMXON, and SIPIs are ignored. */ if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) kvm_make_request(KVM_REQ_EVENT, vcpu); return; } /* * After an early L2 VM-entry failure, we're now back * in L1 which thinks it just finished a VMLAUNCH or * VMRESUME instruction, so we need to set the failure * flag and the VM-instruction error field of the VMCS * accordingly, and skip the emulated instruction. */ (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); /* * Restore L1's host state to KVM's software model. We're here * because a consistency check was caught by hardware, which * means some amount of guest state has been propagated to KVM's * model and needs to be unwound to the host's state. 
*/ nested_vmx_restore_host_state(vcpu); vmx->fail = 0; } static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) { kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); } /* * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). * On success, returns 0. When the operand is invalid, returns 1 and throws * #UD, #GP, or #SS. */ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret) { gva_t off; bool exn; struct kvm_segment s; /* * According to Vol. 3B, "Information for VM Exits Due to Instruction * Execution", on an exit, vmx_instruction_info holds most of the * addressing components of the operand. Only the displacement part * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). * For how an actual address is calculated from all these components, * refer to Vol. 1, "Operand Addressing". */ int scaling = vmx_instruction_info & 3; int addr_size = (vmx_instruction_info >> 7) & 7; bool is_reg = vmx_instruction_info & (1u << 10); int seg_reg = (vmx_instruction_info >> 15) & 7; int index_reg = (vmx_instruction_info >> 18) & 0xf; bool index_is_valid = !(vmx_instruction_info & (1u << 22)); int base_reg = (vmx_instruction_info >> 23) & 0xf; bool base_is_valid = !(vmx_instruction_info & (1u << 27)); if (is_reg) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ off = exit_qualification; /* holds the displacement */ if (addr_size == 1) off = (gva_t)sign_extend64(off, 31); else if (addr_size == 0) off = (gva_t)sign_extend64(off, 15); if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) off += kvm_register_read(vcpu, index_reg) << scaling; vmx_get_segment(vcpu, &s, seg_reg); /* * The effective address, i.e. @off, of a memory operand is truncated * based on the address size of the instruction. Note that this is * the *effective address*, i.e. the address prior to accounting for * the segment's base. */ if (addr_size == 1) /* 32 bit */ off &= 0xffffffff; else if (addr_size == 0) /* 16 bit */ off &= 0xffff; /* Checks for #GP/#SS exceptions. */ exn = false; if (is_long_mode(vcpu)) { /* * The virtual/linear address is never truncated in 64-bit * mode, e.g. a 32-bit address size can yield a 64-bit virtual * address when using FS/GS with a non-zero base. */ if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) *ret = s.base + off; else *ret = off; *ret = vmx_get_untagged_addr(vcpu, *ret, 0); /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode! */ exn = is_noncanonical_address(*ret, vcpu, 0); } else { /* * When not in long mode, the virtual/linear address is * unconditionally truncated to 32 bits regardless of the * address size. */ *ret = (s.base + off) & 0xffffffff; /* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) * - usability check (#GP(0)/#SS(0)) * - limit check (#GP(0)/#SS(0)) */ if (wr) /* #GP(0) if the destination operand is located in a * read-only data segment or any code segment. 
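 *
 * In the 4-bit segment type, bit 3 distinguishes code from data and
 * bit 1 is "writable" for data / "readable" for code, hence:
 *
 *	(s.type & 0xa) == 0	read-only data segment
 *	(s.type & 8) != 0	any code segment
 *	(s.type & 0xa) == 8	execute-only code segment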
*/ exn = ((s.type & 0xa) == 0 || (s.type & 8)); else /* #GP(0) if the source operand is located in an * execute-only code segment */ exn = ((s.type & 0xa) == 8); if (exn) { kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return 1; } /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); /* * Protected mode: #GP(0)/#SS(0) if the memory operand is * outside the segment limit. All CPUs that support VMX ignore * limit checks for flat segments, i.e. segments with base==0, * limit==0xffffffff and of type expand-up data or code. */ if (!(s.base == 0 && s.limit == 0xffffffff && ((s.type & 8) || !(s.type & 4)))) exn = exn || ((u64)off + len - 1 > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, seg_reg == VCPU_SREG_SS ? SS_VECTOR : GP_VECTOR, 0); return 1; } return 0; } static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int *ret) { gva_t gva; struct x86_exception e; int r; if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmcs_read32(VMX_INSTRUCTION_INFO), false, sizeof(*vmpointer), &gva)) { *ret = 1; return -EINVAL; } r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); if (r != X86EMUL_CONTINUE) { *ret = kvm_handle_memory_failure(vcpu, r, &e); return -EINVAL; } return 0; } /* * Allocate a shadow VMCS and associate it with the currently loaded * VMCS, unless such a shadow VMCS already exists. The newly allocated * VMCS is also VMCLEARed, so that it is ready for use. */ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; /* * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it * when L1 executes VMXOFF or the vCPU is forced out of nested * operation. VMXON faults if the CPU is already post-VMXON, so it * should be impossible to already have an allocated shadow VMCS. KVM * doesn't support virtualization of VMCS shadowing, so vmcs01 should * always be the loaded VMCS. */ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) return loaded_vmcs->shadow_vmcs; loaded_vmcs->shadow_vmcs = alloc_vmcs(true); if (loaded_vmcs->shadow_vmcs) vmcs_clear(loaded_vmcs->shadow_vmcs); return loaded_vmcs->shadow_vmcs; } static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int r; r = alloc_loaded_vmcs(&vmx->nested.vmcs02); if (r < 0) goto out_vmcs02; vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = allocate_vpid(); vmx->nested.vmcs02_initialized = false; vmx->nested.vmxon = true; if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; pt_update_intercept_for_msr(vcpu); } return 0; out_shadow_vmcs: kfree(vmx->nested.cached_shadow_vmcs12); out_cached_shadow_vmcs12: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: free_loaded_vmcs(&vmx->nested.vmcs02); out_vmcs02: return -ENOMEM; } /* Emulate the VMXON instruction. 
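 * The checks below are emulated in (roughly) architectural priority
 * order: CR4.VMXE (#UD), CPL (#GP), already post-VMXON (VM-Fail),
 * CR0/CR4 validity and IA32_FEATURE_CONTROL (#GP), and finally the
 * VMXON region's revision identifier (VMfailInvalid).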
 */
static int handle_vmxon(struct kvm_vcpu *vcpu)
{
	int ret;
	gpa_t vmptr;
	uint32_t revision;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;

	/*
	 * Manually check CR4.VMXE; KVM must force CR4.VMXE=1 to enter
	 * the guest and so cannot rely on hardware to perform the check,
	 * which has higher priority than VM-Exit (see Intel SDM's pseudocode
	 * for VMXON).
	 *
	 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
	 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
	 * force any of the relevant guest state. For a restricted guest, KVM
	 * does force CR0.PE=1, but only to also force VM86 in order to emulate
	 * Real Mode, and so there's no need to check CR0.PE manually.
	 */
	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/*
	 * The CPL is checked for "not in VMX operation" and for "in VMX root",
	 * and has higher priority than the VM-Fail due to being post-VMXON,
	 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
	 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
	 * from L2 to L1, i.e. there's no need to check for the vCPU being in
	 * VMX non-root.
	 *
	 * Forwarding the VM-Exit unconditionally, i.e. without performing the
	 * #UD checks (see above), is functionally ok because KVM doesn't allow
	 * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
	 * CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are
	 * missed by hardware due to shadowing CR0 and/or CR4.
	 */
	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (vmx->nested.vmxon)
		return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);

	/*
	 * Invalid CR0/CR4 generates #GP. These checks are performed if and
	 * only if the vCPU isn't already in VMX operation, i.e. they
	 * effectively have lower priority than the VM-Fail above.
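	 *
	 * "Valid" here means the value satisfies the IA32_VMX_CR{0,4}_FIXED0/1
	 * MSRs exposed to L1, conceptually:
	 *
	 *	((val & fixed0) == fixed0) && ((val & ~fixed1) == 0)
	 *
	 * i.e. every bit that must be 1 is set and no bit that must be 0 is
	 * set.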
*/ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { kvm_inject_gp(vcpu, 0); return 1; } if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); return 1; } if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) return ret; /* * SDM 3: 24.11.5 * The first 4 bytes of VMXON region contain the supported * VMCS revision identifier * * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; * which replaces physical address width with 32 */ if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failInvalid(vcpu); if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || revision != VMCS12_REVISION) return nested_vmx_failInvalid(vcpu); vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); if (ret) return ret; return nested_vmx_succeed(vcpu); } static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.current_vmptr == INVALID_GPA) return; copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (enable_shadow_vmcs) { /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ kvm_vcpu_write_guest_page(vcpu, vmx->nested.current_vmptr >> PAGE_SHIFT, vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); vmx->nested.current_vmptr = INVALID_GPA; } /* Emulate the VMXOFF instruction */ static int handle_vmxoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; free_nested(vcpu); if (kvm_apic_has_pending_init_or_sipi(vcpu)) kvm_make_request(KVM_REQ_EVENT, vcpu); return nested_vmx_succeed(vcpu); } /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 zero = 0; gpa_t vmptr; int r; if (!nested_vmx_check_permission(vcpu)) return 1; if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) return r; if (!page_address_valid(vcpu, vmptr)) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); /* * Silently ignore memory errors on VMCLEAR, Intel's pseudocode * for VMCLEAR includes a "ensure that data for VMCS referenced * by the operand is in memory" clause that guards writes to * memory, i.e. doing nothing for I/O is architecturally valid. * * FIXME: Suppress failures if and only if no memslot is found, * i.e. exit to userspace if __copy_to_user() fails. */ (void)kvm_vcpu_write_guest(vcpu, vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); } return nested_vmx_succeed(vcpu); } /* Emulate the VMLAUNCH instruction */ static int handle_vmlaunch(struct kvm_vcpu *vcpu) { return nested_vmx_run(vcpu, true); } /* Emulate the VMRESUME instruction */ static int handle_vmresume(struct kvm_vcpu *vcpu) { return nested_vmx_run(vcpu, false); } static int handle_vmread(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? 
get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; unsigned long field; u64 value; gva_t gva = 0; short offset; int len, r; if (!nested_vmx_check_permission(vcpu)) return 1; /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); if (!nested_vmx_is_evmptr12_valid(vmx)) { /* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. */ if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); /* Read the field, zero-extended to a u64 value */ value = vmcs12_read_any(vmcs12, field, offset); } else { /* * Hyper-V TLFS (as of 6.0b) explicitly states, that while an * enlightened VMCS is active VMREAD/VMWRITE instructions are * unsupported. Unfortunately, certain versions of Windows 11 * don't comply with this requirement which is not enforced in * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a * workaround, as misbehaving guests will panic on VM-Fail. * Note, enlightened VMCS is incompatible with shadow VMCS so * all VMREADs from L2 should go to L1. */ if (WARN_ON_ONCE(is_guest_mode(vcpu))) return nested_vmx_failInvalid(vcpu); offset = evmcs_field_offset(field, NULL); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* Read the field, zero-extended to a u64 value */ value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); } /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending * on the guest's mode (32 or 64 bit), not on the given field's length. */ if (instr_info & BIT(10)) { kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); } else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); } return nested_vmx_succeed(vcpu); } static bool is_shadow_field_rw(unsigned long field) { switch (field) { #define SHADOW_FIELD_RW(x, y) case x: #include "vmcs_shadow_fields.h" return true; default: break; } return false; } static bool is_shadow_field_ro(unsigned long field) { switch (field) { #define SHADOW_FIELD_RO(x, y) case x: #include "vmcs_shadow_fields.h" return true; default: break; } return false; } static int handle_vmwrite(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; unsigned long field; short offset; gva_t gva; int len, r; /* * The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. 
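 * (A field's width is encoded in bits 14:13 of its VMCS encoding:
 * 0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural-width.)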
The code below first zero-extends the value to 64 * bit (value), and then copies only the appropriate number of * bits into the vmcs12 field. */ u64 value = 0; if (!nested_vmx_check_permission(vcpu)) return 1; /* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMWRITE sets the ALU flags for VMfailInvalid. */ if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); if (instr_info & BIT(10)) value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, instr_info, false, len, &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); } field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * If the vCPU supports "VMWRITE to any supported field in the * VMCS," then the "read-only" fields are actually read/write. */ if (vmcs_field_readonly(field) && !nested_cpu_has_vmwrite_any_field(vcpu)) return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); /* * Ensure vmcs12 is up-to-date before any VMWRITE that dirties * vmcs12, else we may crush a field or consume a stale value. */ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); /* * Some Intel CPUs intentionally drop the reserved bits of the AR byte * fields on VMWRITE. Emulate this behavior to ensure consistent KVM * behavior regardless of the underlying hardware, e.g. if an AR_BYTE * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD * from L1 will return a different value than VMREAD from L2 (L1 sees * the stripped down value, L2 sees the full value as stored by KVM). */ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) value &= 0x1f0ff; vmcs12_write_any(vmcs12, field, offset, value); /* * Do not track vmcs12 dirty-state if in guest-mode as we actually * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated * by L1 without a vmexit are always updated in the vmcs02, i.e. don't * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. */ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { /* * L1 can read these fields without exiting, ensure the * shadow VMCS is up-to-date. 
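 *
 * VMWRITE only operates on the current VMCS, so updating the shadow
 * VMCS means temporarily making it current: load it, write the field,
 * VMCLEAR it, and reload the original VMCS, all with preemption
 * disabled so the per-CPU notion of "current VMCS" can't change
 * underneath.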
*/ if (enable_shadow_vmcs && is_shadow_field_ro(field)) { preempt_disable(); vmcs_load(vmx->vmcs01.shadow_vmcs); __vmcs_writel(field, value); vmcs_clear(vmx->vmcs01.shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); preempt_enable(); } vmx->nested.dirty_vmcs12 = true; } return nested_vmx_succeed(vcpu); } static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) { vmx->nested.current_vmptr = vmptr; if (enable_shadow_vmcs) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.need_vmcs12_to_shadow_sync = true; } vmx->nested.dirty_vmcs12 = true; vmx->nested.force_msr_bitmap_recalc = true; } /* Emulate the VMPTRLD instruction */ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; int r; if (!nested_vmx_check_permission(vcpu)) return 1; if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) return r; if (!page_address_valid(vcpu, vmptr)) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ if (nested_vmx_is_evmptr12_valid(vmx)) return 1; if (vmx->nested.current_vmptr != vmptr) { struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; struct vmcs_hdr hdr; if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the * given physical address won't match the required * VMCS12_REVISION identifier. */ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, offsetof(struct vmcs12, hdr), sizeof(hdr))) { return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } if (hdr.revision_id != VMCS12_REVISION || (hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } nested_release_vmcs12(vcpu); /* * Load VMCS12 from guest memory since it is not already * cached. 
*/ if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, VMCS12_SIZE)) { return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } set_current_vmptr(vmx, vmptr); } return nested_vmx_succeed(vcpu); } /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { unsigned long exit_qual = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; gva_t gva; int r; if (!nested_vmx_check_permission(vcpu)) return 1; if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) return 1; if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, sizeof(gpa_t), &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, sizeof(gpa_t), &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); return nested_vmx_succeed(vcpu); } /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; unsigned long type, roots_to_free; struct kvm_mmu *mmu; gva_t gva; struct x86_exception e; struct { u64 eptp, gpa; } operand; int i, r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } if (!nested_vmx_check_permission(vcpu)) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (type >= 32 || !(types & (1 << type))) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); /* * Nested EPT roots are always held through guest_mmu, * not root_mmu. 
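 * (root_mmu handles L1's own paging; the shadowed EPT roots used while
 * running L2 live in guest_mmu, which is why the INVEPT emulation
 * below frees roots from guest_mmu only.)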
*/ mmu = &vcpu->arch.guest_mmu; switch (type) { case VMX_EPT_EXTENT_CONTEXT: if (!nested_vmx_check_eptp(vcpu, operand.eptp)) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); roots_to_free = 0; if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, operand.eptp)) roots_to_free |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (nested_ept_root_matches(mmu->prev_roots[i].hpa, mmu->prev_roots[i].pgd, operand.eptp)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } break; case VMX_EPT_EXTENT_GLOBAL: roots_to_free = KVM_MMU_ROOTS_ALL; break; default: BUG(); break; } if (roots_to_free) kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); return nested_vmx_succeed(vcpu); } static int handle_invvpid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info; unsigned long type, types; gva_t gva; struct x86_exception e; struct { u64 vpid; u64 gla; } operand; u16 vpid02; int r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } if (!nested_vmx_check_permission(vcpu)) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* * Always flush the effective vpid02, i.e. never flush the current VPID * and never explicitly flush vpid01. INVVPID targets a VPID, not a * VMCS, and so whether or not the current vmcs12 has VPID enabled is * irrelevant (and there may not be a loaded vmcs12). */ vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: /* * LAM doesn't apply to addresses that are inputs to TLB * invalidation. */ if (!operand.vpid || is_noncanonical_invlpg_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_context(vpid02); break; case VMX_VPID_EXTENT_ALL_CONTEXT: vpid_sync_context(vpid02); break; default: WARN_ON_ONCE(1); return kvm_skip_emulated_instruction(vcpu); } /* * Sync the shadow page tables if EPT is disabled, L1 is invalidating * linear mappings for L2 (tagged with L2's VPID). Free all guest * roots as VPIDs are not tracked in the MMU role. * * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share * an MMU when EPT is disabled. * * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
*/ if (!enable_ept) kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); return nested_vmx_succeed(vcpu); } static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { u32 index = kvm_rcx_read(vcpu); u64 new_eptp; if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) return 1; if (index >= VMFUNC_EPTP_ENTRIES) return 1; if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, &new_eptp, index * 8, 8)) return 1; /* * If the (L2) guest does a vmfunc to the currently * active ept pointer, we don't have to do anything else */ if (vmcs12->ept_pointer != new_eptp) { if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; vmcs12->ept_pointer = new_eptp; nested_ept_new_eptp(vcpu); if (!nested_cpu_has_vpid(vmcs12)) kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } return 0; } static int handle_vmfunc(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; u32 function = kvm_rax_read(vcpu); /* * VMFUNC should never execute cleanly while L1 is active; KVM supports * VMFUNC for nested VMs, but not for L1. */ if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } vmcs12 = get_vmcs12(vcpu); /* * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC * is enabled in vmcs02 if and only if it's enabled in vmcs12. */ if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } if (!(vmcs12->vm_function_control & BIT_ULL(function))) goto fail; switch (function) { case 0: if (nested_vmx_eptp_switching(vcpu, vmcs12)) goto fail; break; default: goto fail; } return kvm_skip_emulated_instruction(vcpu); fail: /* * This is effectively a reflected VM-Exit, as opposed to a synthesized * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason. */ nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; } /* * Return true if an IO instruction with the specified port and size should cause * a VM-exit into L1. */ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gpa_t bitmap, last_bitmap; u8 b; last_bitmap = INVALID_GPA; b = -1; while (size > 0) { if (port < 0x8000) bitmap = vmcs12->io_bitmap_a; else if (port < 0x10000) bitmap = vmcs12->io_bitmap_b; else return true; bitmap += (port & 0x7fff) / 8; if (last_bitmap != bitmap) if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) return true; if (b & (1 << (port & 7))) return true; port++; size--; last_bitmap = bitmap; } return false; } static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { unsigned long exit_qualification; unsigned short port; int size; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); exit_qualification = vmx_get_exit_qual(vcpu); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; return nested_vmx_check_io_bitmaps(vcpu, port, size); } /* * Return 1 if we should exit from L2 to L1 to handle an MSR access, * rather than handle it ourselves in L0. I.e., check whether L1 expressed * disinterest in the current event (read or write a specific MSR) by using an * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 
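 *
 * Worked example of the lookup done below (illustration only): the
 * 4 KiB bitmap is split into four 1024-byte regions: reads of low MSRs
 * (0x00000000 - 0x00001fff) at offset 0x000, reads of high MSRs
 * (0xc0000000 - 0xc0001fff) at 0x400, writes of low MSRs at 0x800, and
 * writes of high MSRs at 0xc00.  An exit for a write to MSR_EFER
 * (0xc0000080) therefore tests bit 0 of the byte at offset
 * 2048 + 1024 + (0x80 / 8) = 0xc10 within vmcs12->msr_bitmap.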
*/ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, union vmx_exit_reason exit_reason) { u32 msr_index; gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return true; if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) msr_index = vmx_get_exit_qual(vcpu); else msr_index = kvm_rcx_read(vcpu); /* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, * for the four combinations of read/write and low/high MSR numbers. * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; if (exit_reason.basic == EXIT_REASON_MSR_WRITE || exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; bitmap += 1024; } /* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsigned char b; if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) return true; return 1 & (b >> (msr_index & 7)); } else return true; /* let L1 handle the wrong parameter */ } /* * Return 1 if we should exit from L2 to L1 to handle a CR access exit, * rather than handle it ourselves in L0. I.e., check if L1 wanted to * intercept (via guest_host_mask etc.) the current event. */ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int cr = exit_qualification & 15; int reg; unsigned long val; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ reg = (exit_qualification >> 8) & 15; val = kvm_register_read(vcpu, reg); switch (cr) { case 0: if (vmcs12->cr0_guest_host_mask & (val ^ vmcs12->cr0_read_shadow)) return true; break; case 3: if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) return true; break; case 4: if (vmcs12->cr4_guest_host_mask & (vmcs12->cr4_read_shadow ^ val)) return true; break; case 8: if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) return true; break; } break; case 2: /* clts */ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && (vmcs12->cr0_read_shadow & X86_CR0_TS)) return true; break; case 1: /* mov from cr */ switch (cr) { case 3: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_CR3_STORE_EXITING) return true; break; case 8: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_CR8_STORE_EXITING) return true; break; } break; case 3: /* lmsw */ /* * lmsw can change bits 1..3 of cr0, and only set bit 0 of * cr0. Other attempted changes are ignored, with no exit. 
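 *
 * Worked example (illustration only): LMSW only touches CR0[3:0], i.e.
 * PE, MP, EM and TS.  If L1 owns CR0.TS (bit 3 of cr0_guest_host_mask)
 * and the LMSW source value differs from the read shadow in that bit,
 * the "mask & 0xe & (val ^ shadow)" test below fires and the exit is
 * reflected to L1.  PE (bit 0) is special: LMSW can only set it, so
 * only a 0 -> 1 transition relative to the read shadow is checked,
 * which is the second test below.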
*/ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; if (vmcs12->cr0_guest_host_mask & 0xe & (val ^ vmcs12->cr0_read_shadow)) return true; if ((vmcs12->cr0_guest_host_mask & 0x1) && !(vmcs12->cr0_read_shadow & 0x1) && (val & 0x1)) return true; break; } return false; } static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { u32 encls_leaf; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) return false; encls_leaf = kvm_rax_read(vcpu); if (encls_leaf > 62) encls_leaf = 63; return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); } static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, gpa_t bitmap) { u32 vmx_instruction_info; unsigned long field; u8 b; if (!nested_cpu_has_shadow_vmcs(vmcs12)) return true; /* Decode instruction info and find the field to access */ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Out-of-range fields always cause a VM exit from L2 to L1 */ if (field >> 15) return true; if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) return true; return 1 & (b >> (field & 7)); } static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) { u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; if (nested_cpu_has_mtf(vmcs12)) return true; /* * An MTF VM-exit may be injected into the guest by setting the * interruption-type to 7 (other event) and the vector field to 0. Such * is the case regardless of the 'monitor trap flag' VM-execution * control. */ return entry_intr_info == (INTR_INFO_VALID_MASK | INTR_TYPE_OTHER_EVENT); } /* * Return true if L0 wants to handle an exit from L2 regardless of whether or not * L1 wants the exit. Only call this when in is_guest_mode (L2). */ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, union vmx_exit_reason exit_reason) { u32 intr_info; switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) return vcpu->arch.apf.host_apf_flags || vmx_need_pf_intercept(vcpu); else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) return true; else if (is_breakpoint(intr_info) && vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return true; else if (is_alignment_check(intr_info) && !vmx_guest_inject_ac(vcpu)) return true; else if (is_ve_fault(intr_info)) return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; case EXIT_REASON_MCE_DURING_VMENTRY: return true; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is * used, and the nested mmu code discovers that the address is * missing in the guest EPT table (EPT12), the EPT violation * will be injected with nested_ept_inject_page_fault() */ return true; case EXIT_REASON_EPT_MISCONFIG: /* * L2 never uses directly L1's EPT, but rather L0's own EPT * table (shadow on EPT) or a merged EPT table that L0 built * (EPT on EPT). So any problems with the structure of the * table is L0's fault. */ return true; case EXIT_REASON_PREEMPTION_TIMER: return true; case EXIT_REASON_PML_FULL: /* * PML is emulated for an L1 VMM and should never be enabled in * vmcs02, always "handle" PML_FULL by exiting to userspace. */ return true; case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. 
*/ return true; case EXIT_REASON_BUS_LOCK: /* * At present, bus lock VM exit is never exposed to L1. * Handle L2's bus locks in L0 directly. */ return true; #ifdef CONFIG_KVM_HYPERV case EXIT_REASON_VMCALL: /* Hyper-V L2 TLB flush hypercall is handled by L0 */ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && nested_evmcs_l2_tlb_flush_enabled(vcpu) && kvm_hv_is_tlb_flush_hcall(vcpu); #endif default: break; } return false; } /* * Return 1 if L1 wants to intercept an exit from L2. Only call this when in * is_guest_mode (L2). */ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, union vmx_exit_reason exit_reason) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 intr_info; switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI: intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) return true; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: return nested_exit_on_intr(vcpu); case EXIT_REASON_TRIPLE_FAULT: return true; case EXIT_REASON_INTERRUPT_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); case EXIT_REASON_NMI_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); case EXIT_REASON_INVD: return true; case EXIT_REASON_INVLPG: return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); case EXIT_REASON_RDRAND: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); case EXIT_REASON_RDSEED: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMREAD: return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, vmcs12->vmread_bitmap); case EXIT_REASON_VMWRITE: return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! 
*/ return true; case EXIT_REASON_CR_ACCESS: return nested_vmx_exit_handled_cr(vcpu, vmcs12); case EXIT_REASON_DR_ACCESS: return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: return nested_vmx_exit_handled_io(vcpu, vmcs12); case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: case EXIT_REASON_MSR_READ_IMM: case EXIT_REASON_MSR_WRITE_IMM: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: return true; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_TRAP_FLAG: return nested_vmx_exit_handled_mtf(vmcs12); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || nested_cpu_has2(vmcs12, SECONDARY_EXEC_PAUSE_LOOP_EXITING); case EXIT_REASON_MCE_DURING_VMENTRY: return true; case EXIT_REASON_TPR_BELOW_THRESHOLD: return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: case EXIT_REASON_APIC_WRITE: case EXIT_REASON_EOI_INDUCED: /* * The controls for "virtualize APIC accesses," "APIC- * register virtualization," and "virtual-interrupt * delivery" only come from vmcs12. */ return true; case EXIT_REASON_INVPCID: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return true; case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: /* * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap * verbatim, i.e. any exit is due to L1's bitmap. WARN if * XSAVES isn't enabled, as the CPU is supposed to inject #UD * in that case, before consulting the XSS-bitmap. */ WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); return true; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); case EXIT_REASON_NOTIFY: /* Notify VM exit is not exposed to L1 */ return false; case EXIT_REASON_SEAMCALL: case EXIT_REASON_TDCALL: /* * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't * virtualized by KVM for L1 hypervisors, i.e. L1 should * never want or expect such an exit. */ return false; default: return true; } } /* * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was * reflected into L1. */ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsigned long exit_qual; u32 exit_intr_info; WARN_ON_ONCE(vmx->nested.nested_run_pending); /* * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM * has already loaded L2's state. */ if (unlikely(vmx->fail)) { trace_kvm_nested_vmenter_failed( "hardware VM-instruction error: ", vmcs_read32(VM_INSTRUCTION_ERROR)); exit_intr_info = 0; exit_qual = 0; goto reflect_vmexit; } trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) return false; /* If L1 doesn't want the exit, handle it in L0. 
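 *
 * In other words, the overall routing is: exits that L0 must own
 * (nested_vmx_l0_wants_exit() above) are always handled by KVM; of the
 * remainder, only those L1 asked for via vmcs12 controls and bitmaps
 * (nested_vmx_l1_wants_exit() below) are reflected with
 * nested_vmx_vmexit(), and everything else is handled by KVM on L1's
 * behalf.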
*/ if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) return false; /* * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would * need to be synthesized by querying the in-kernel LAPIC, but external * interrupts are never reflected to L1 so it's a non-issue. */ exit_intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(exit_intr_info)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); } exit_qual = vmx_get_exit_qual(vcpu); reflect_vmexit: nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); return true; } static int vmx_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) { struct vcpu_vmx *vmx; struct vmcs12 *vmcs12; struct kvm_nested_state kvm_state = { .flags = 0, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), .hdr.vmx.flags = 0, .hdr.vmx.vmxon_pa = INVALID_GPA, .hdr.vmx.vmcs12_pa = INVALID_GPA, .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; if (!vcpu) return kvm_state.size + sizeof(*user_vmx_nested_state); vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; if (vmx_has_valid_vmcs12(vcpu)) { kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ if (nested_vmx_is_evmptr12_set(vmx)) kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != INVALID_GPA) kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); } if (vmx->nested.smm.vmxon) kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; if (vmx->nested.smm.guest_mode) kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; if (is_guest_mode(vcpu)) { kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; if (vmx->nested.nested_run_pending) kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; if (vmx->nested.mtf_pending) kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; if (nested_cpu_has_preemption_timer(vmcs12) && vmx->nested.has_preemption_timer_deadline) { kvm_state.hdr.vmx.flags |= KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; kvm_state.hdr.vmx.preemption_timer_deadline = vmx->nested.preemption_timer_deadline; } } } if (user_data_size < kvm_state.size) goto out; if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) return -EFAULT; if (!vmx_has_valid_vmcs12(vcpu)) goto out; /* * When running L2, the authoritative vmcs12 state is in the * vmcs02. When running L1, the authoritative vmcs12 state is * in the shadow or enlightened vmcs linked to vmcs01, unless * need_vmcs12_to_shadow_sync is set, in which case, the authoritative * vmcs12 state is in the vmcs12 already. 
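 *
 * For reference, a sketch of the ioctl contract implemented above
 * (illustration, not new behaviour): the blob copied to userspace is
 * the kvm_state header followed by one 4 KiB (VMCS12_SIZE) copy of
 * vmcs12, plus a second 4 KiB region when a shadow vmcs12 is in use.
 * If user_data_size is smaller than the resulting kvm_state.size,
 * nothing is copied and the required size is returned so userspace can
 * retry with a larger buffer.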
*/ if (is_guest_mode(vcpu)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); } else { copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { if (nested_vmx_is_evmptr12_valid(vmx)) /* * L1 hypervisor is not obliged to keep eVMCS * clean fields data always up-to-date while * not in guest mode, 'hv_clean_fields' is only * supposed to be actual upon vmentry so we need * to ignore it here and do full copy. */ copy_enlightened_to_vmcs12(vmx, 0); else if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); } } BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); /* * Copy over the full allocated size of vmcs12 rather than just the size * of the struct. */ if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) return -EFAULT; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != INVALID_GPA) { if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; } out: return kvm_state.size; } void vmx_leave_nested(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.nested_run_pending = 0; nested_vmx_vmexit(vcpu, -1, 0, 0); } free_nested(vcpu); } static int vmx_set_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; enum vm_entry_failure_code ignored; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; int ret; if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) return -EINVAL; if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { if (kvm_state->hdr.vmx.smm.flags) return -EINVAL; if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) return -EINVAL; /* * KVM_STATE_NESTED_EVMCS used to signal that KVM should * enable eVMCS capability on vCPU. However, since then * code was changed such that flag signals vmcs12 should * be copied into eVMCS in guest memory. * * To preserve backwards compatibility, allow user * to set this flag even when there is no VMXON region. */ if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) return -EINVAL; } if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return -EINVAL; if (kvm_state->hdr.vmx.smm.flags & ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) return -EINVAL; /* * SMM temporarily disables VMX, so we cannot be in guest mode, * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags * must be zero. */ if (is_smm(vcpu) ? 
(kvm_state->flags & (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) : kvm_state->hdr.vmx.smm.flags) return -EINVAL; if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; vmx_leave_nested(vcpu); if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) return 0; vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; ret = enter_vmx_operation(vcpu); if (ret) return ret; /* Empty 'VMXON' state is permitted if no VMCS loaded */ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { /* See vmx_has_valid_vmcs12. */ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) return -EINVAL; else return 0; } if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) return -EINVAL; set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); #ifdef CONFIG_KVM_HYPERV } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* * nested_vmx_handle_enlightened_vmptrld() cannot be called * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages(). */ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); #endif } else { return -EINVAL; } if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { vmx->nested.smm.vmxon = true; vmx->nested.vmxon = false; if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) vmx->nested.smm.guest_mode = true; } vmcs12 = get_vmcs12(vcpu); if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) return -EFAULT; if (vmcs12->hdr.revision_id != VMCS12_REVISION) return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return 0; vmx->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); vmx->nested.mtf_pending = !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != INVALID_GPA) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); if (kvm_state->size < sizeof(*kvm_state) + sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) goto error_guest_mode; if (copy_from_user(shadow_vmcs12, user_vmx_nested_state->shadow_vmcs12, sizeof(*shadow_vmcs12))) { ret = -EFAULT; goto error_guest_mode; } if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || !shadow_vmcs12->hdr.shadow_vmcs) goto error_guest_mode; } vmx->nested.has_preemption_timer_deadline = false; if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { vmx->nested.has_preemption_timer_deadline = true; vmx->nested.preemption_timer_deadline = kvm_state->hdr.vmx.preemption_timer_deadline; } if (nested_vmx_check_controls(vcpu, vmcs12) || nested_vmx_check_host_state(vcpu, vmcs12) || nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; vmx->nested.force_msr_bitmap_recalc = true; ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) goto error_guest_mode; if (vmx->nested.mtf_pending) kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; error_guest_mode: vmx->nested.nested_run_pending = 0; return ret; } void 
nested_vmx_set_vmcs_shadowing_bitmap(void) { if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); } } /* * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo * that madness to get the encoding for comparison. */ #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) static u64 nested_vmx_calc_vmcs_enum_msr(void) { /* * Note these are the so called "index" of the VMCS field encoding, not * the index into vmcs12. */ unsigned int max_idx, idx; int i; /* * For better or worse, KVM allows VMREAD/VMWRITE to all fields in * vmcs12, regardless of whether or not the associated feature is * exposed to L1. Simply find the field with the highest index. */ max_idx = 0; for (i = 0; i < nr_vmcs12_fields; i++) { /* The vmcs12 table is very, very sparsely populated. */ if (!vmcs12_field_offsets[i]) continue; idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); if (idx > max_idx) max_idx = idx; } return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; } static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->pinbased_ctls_low = PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; msrs->pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | (enable_apicv ? PIN_BASED_POSTED_INTR : 0); msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; } static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; msrs->exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && !kvm_cpu_cap_has(X86_FEATURE_IBT)) msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; } static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; msrs->entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | VM_ENTRY_LOAD_CET_STATE; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && !kvm_cpu_cap_has(X86_FEATURE_IBT)) msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; /* We support free control of debug control loading. 
*/ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; } static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; msrs->procbased_ctls_high &= CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the * hardware. For example, L1 can specify an MSR bitmap - and we * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ msrs->procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ msrs->procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); } static void nested_vmx_setup_secondary_ctls(u32 ept_caps, struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; /* * We can emulate "VMCS shadowing," even if the hardware * doesn't support it. */ msrs->secondary_ctls_high |= SECONDARY_EXEC_SHADOW_VMCS; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPT_PAGE_WALK_5_BIT | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT | VMX_EPT_EXECUTE_ONLY_BIT; msrs->ept_caps &= ept_caps; msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_1GB_PAGE_BIT; if (enable_ept_ad_bits) { msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; msrs->ept_caps |= VMX_EPT_AD_BIT; } /* * Advertise EPTP switching irrespective of hardware support, * KVM emulates it in software so long as VMFUNC is supported. */ if (cpu_has_vmx_vmfunc()) msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; } /* * Old versions of KVM use the single-context version without * checking for support, so declare that it is supported even * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse. 
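 *
 * Concretely (illustration of the consequence, not new behaviour): the
 * vpid_caps value below advertises every INVVPID extent in
 * VMX_VPID_EXTENT_SUPPORTED_MASK, and handle_invvpid() above then
 * emulates the single-context flavours by flushing everything tagged
 * with the vpid02 used for L2, a superset of what L1 asked for.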
*/ if (enable_vpid) { msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_VPID; msrs->vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; } if (enable_unrestricted_guest) msrs->secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; if (flexpriority_enabled) msrs->secondary_ctls_high |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (enable_sgx) msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; } static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, struct nested_vmx_msrs *msrs) { msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; msrs->misc_high = 0; } static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) { /* * This MSR reports some information about VMX support. We * should return information about the VMX we emulate for the * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, X86_MEMTYPE_WB); msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; if (cpu_has_vmx_basic_no_hw_errcode_cc()) msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; } static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) { /* * These MSRs specify bits which the guest must keep fixed on * while L1 is in VMXON mode (in L1's root mode, or running an L2). * We picked the standard core2 setting. */ #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) #define VMXON_CR4_ALWAYSON X86_CR4_VMXE msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. */ rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; } /* * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be * returned for the various VMX controls MSRs when nested VMX is enabled. * The same values should also be used to verify that vmcs12 control fields are * valid during nested entry from L1 to L2. * Each of these control msrs has a low and high 32-bit half: A low bit is on * if the corresponding bit in the (32-bit) control field *must* be on, and a * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). */ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) { struct nested_vmx_msrs *msrs = &vmcs_conf->nested; /* * Note that as a general rule, the high half of the MSRs (bits in * the control fields which may be 1) should be initialized by the * intersection of the underlying hardware's MSR (i.e., features which * can be supported) and the list of features we want to expose - * because they are known to be properly supported in our code. * Also, usually, the low half of the MSRs (bits which must be 1) can * be set to 0, meaning that L1 may turn off any of these bits. The * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and * nested_vmx_l1_wants_exit() will not pass related exits to L1. * These rules have exceptions below. 
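 *
 * Sketch of the resulting check (see vmx_control_verify(), illustration
 * only): a vmcs12 control value "ctl" is accepted iff
 *
 *	(ctl & low) == low  &&  (ctl & high) == ctl
 *
 * i.e. every must-be-1 bit is set and no bit outside the may-be-1 mask
 * is set.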
*/ nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); nested_vmx_setup_exit_ctls(vmcs_conf, msrs); nested_vmx_setup_entry_ctls(vmcs_conf, msrs); nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); nested_vmx_setup_misc_data(vmcs_conf, msrs); nested_vmx_setup_basic(msrs); nested_vmx_setup_cr_fixed(msrs); msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } void nested_vmx_hardware_unsetup(void) { int i; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) free_page((unsigned long)vmx_bitmap[i]); } } __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) { /* * The vmx_bitmap is not tied to a VM and so should * not be charged to a memcg. */ vmx_bitmap[i] = (unsigned long *) __get_free_page(GFP_KERNEL); if (!vmx_bitmap[i]) { nested_vmx_hardware_unsetup(); return -ENOMEM; } } init_vmcs_shadow_fields(); } exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; exit_handlers[EXIT_REASON_VMON] = handle_vmxon; exit_handlers[EXIT_REASON_INVEPT] = handle_invept; exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; return 0; } struct kvm_x86_nested_ops vmx_nested_ops = { .leave_nested = vmx_leave_nested, .is_exception_vmexit = nested_vmx_is_exception_vmexit, .check_events = vmx_check_nested_events, .has_events = vmx_has_nested_events, .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_nested_state_pages = vmx_get_nested_state_pages, .write_log_dirty = nested_vmx_write_pml_buffer, #ifdef CONFIG_KVM_HYPERV .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, #endif }; |
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ #include <linux/bitfield.h> #include <linux/bug.h> #include <linux/const.h> #include <linux/limits.h> #define 
__FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable #define __RENAME(x) __asm__(#x) #define FORTIFY_REASON_DIR(r) FIELD_GET(BIT(0), r) #define FORTIFY_REASON_FUNC(r) FIELD_GET(GENMASK(7, 1), r) #define FORTIFY_REASON(func, write) (FIELD_PREP(BIT(0), write) | \ FIELD_PREP(GENMASK(7, 1), func)) /* Overridden by KUnit tests. */ #ifndef fortify_panic # define fortify_panic(func, write, avail, size, retfail) \ __fortify_panic(FORTIFY_REASON(func, write), avail, size) #endif #ifndef fortify_warn_once # define fortify_warn_once(x...) WARN_ONCE(x) #endif #define FORTIFY_READ 0 #define FORTIFY_WRITE 1 #define EACH_FORTIFY_FUNC(macro) \ macro(strncpy), \ macro(strnlen), \ macro(strlen), \ macro(strscpy), \ macro(strlcat), \ macro(strcat), \ macro(strncat), \ macro(memset), \ macro(memcpy), \ macro(memmove), \ macro(memscan), \ macro(memcmp), \ macro(memchr), \ macro(memchr_inv), \ macro(kmemdup), \ macro(strcpy), \ macro(UNKNOWN), #define MAKE_FORTIFY_FUNC(func) FORTIFY_FUNC_##func enum fortify_func { EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC) }; void __fortify_report(const u8 reason, const size_t avail, const size_t size); void __fortify_panic(const u8 reason, const size_t avail, const size_t size) __cold __noreturn; void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); #define __compiletime_strlen(p) \ ({ \ char *__p = (char *)(p); \ size_t __ret = SIZE_MAX; \ const size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ __p[__p_len] == '\0') \ __ret = __builtin_strlen(__p); \ } \ __ret; \ }) #if defined(__SANITIZE_ADDRESS__) #if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY) extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); #elif defined(CONFIG_KASAN_GENERIC) extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__asan_memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memcpy); #else /* CONFIG_KASAN_SW_TAGS */ extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__hwasan_memset); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memmove); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memcpy); #endif extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern char 
*__underlying_strcat(char *p, const char *q) __RENAME(strcat); extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else #if defined(__SANITIZE_MEMORY__) /* * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the * corresponding __msan_XXX functions. */ #include <linux/kmsan_string.h> #define __underlying_memcpy __msan_memcpy #define __underlying_memmove __msan_memmove #define __underlying_memset __msan_memset #else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset #endif #define __underlying_memchr __builtin_memchr #define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen #define __underlying_strncat __builtin_strncat #define __underlying_strncpy __builtin_strncpy #endif /** * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking * * @dst: Destination memory address to write to * @src: Source memory address to read from * @bytes: How many bytes to write to @dst from @src * @justification: Free-form text or comment describing why the use is needed * * This should be used for corner cases where the compiler cannot do the * right thing, or during transitions between APIs, etc. It should be used * very rarely, and includes a place for justification detailing where bounds * checking has happened, and why existing solutions cannot be employed. */ #define unsafe_memcpy(dst, src, bytes, justification) \ __underlying_memcpy(dst, src, bytes) /* * Clang's use of __builtin_*object_size() within inlines needs hinting via * __pass_*object_size(). The preference is to only ever use type 1 (member * size, rather than struct size), but there remain some stragglers using * type 0 that will be converted in the future. */ #if __has_builtin(__builtin_dynamic_object_size) #define POS __pass_dynamic_object_size(1) #define POS0 __pass_dynamic_object_size(0) #else #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) #endif #define __compiletime_lessthan(bounds, length) ( \ __builtin_constant_p((bounds) < (length)) && \ (bounds) < (length) \ ) /** * strncpy - Copy a string to memory with non-guaranteed NUL padding * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * @size: bytes to write at @p * * If strlen(@q) >= @size, the copy of @q will stop after @size bytes, * and @p will NOT be NUL-terminated * * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes * will be written to @p until @size total bytes have been written. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * over-reads of @q, it cannot defend against writing unterminated * results to @p. Using strncpy() remains ambiguous and fragile. 
* Instead, please choose an alternative, so that the expectation * of @p's contents is unambiguous: * * +--------------------+--------------------+------------+ * | **p** needs to be: | padded to **size** | not padded | * +====================+====================+============+ * | NUL-terminated | strscpy_pad() | strscpy() | * +--------------------+--------------------+------------+ * | not NUL-terminated | strtomem_pad() | strtomem() | * +--------------------+--------------------+------------+ * * Note strscpy*()'s differing return values for detecting truncation, * and strtomem*()'s expectation that the destination is marked with * __nonstring when it is a character array. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { const size_t p_size = __member_size(p); if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p_size, size, p); return __underlying_strncpy(p, q, size); } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); /** * strnlen - Return bounded count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * @maxlen: maximum number of characters to count. * * Returns number of characters in @p (NOT including the final NUL), or * @maxlen, if no NUL has been found up to there. * */ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { const size_t p_size = __member_size(p); const size_t p_len = __compiletime_strlen(p); size_t ret; /* We can take compile-time actions when maxlen is const. */ if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) { /* If p is const, we can use its compile-time-known len. */ if (maxlen >= p_size) return p_len; } /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } /* * Defined after fortified strnlen to reuse it. However, it must still be * possible for strlen() to be used on compile-time strings for use in * static initializers (i.e. as a constant expression). */ /** * strlen - Return count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * * Do not use this function unless the string length is known at * compile-time. When @p is unterminated, this function may crash * or return unexpected counts that could lead to memory content * exposures. Prefer strnlen(). * * Returns number of characters in @p (NOT including the final NUL). * */ #define strlen(p) \ __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)), \ __builtin_strlen(p), __fortify_strlen(p)) __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1) __kernel_size_t __fortify_strlen(const char * const POS p) { const size_t p_size = __member_size(p); __kernel_size_t ret; /* Give up if we don't know how large p is. */ if (p_size == SIZE_MAX) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } /* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(sized_strscpy); __FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const POS q, size_t size) { /* Use string size rather than possible enclosing struct size. 
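 *
 * That is (broad strokes): both p and q are annotated with POS, the
 * type-1 object-size hint, so p_size and q_size below are the sizes of
 * the string members themselves rather than of any enclosing struct.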
*/ const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t len; /* If we cannot get size of p and q default to call strscpy. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strscpy(p, q, size); /* * If size can be known at compile time and is greater than * p_size, generate a compile time write overflow error. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Short-circuit for compile-time known-safe lengths. */ if (__compiletime_lessthan(p_size, SIZE_MAX)) { len = __compiletime_strlen(q); if (len < SIZE_MAX && __compiletime_lessthan(len, size)) { __underlying_memcpy(p, q, len + 1); return len; } } /* * This call protects from read overflow, because len will default to q * length if it smaller than size. */ len = strnlen(q, size); /* * If len equals size, we will copy only size bytes which leads to * -E2BIG being returned. * Otherwise we will copy len + 1 because of the final '\O'. */ len = len == size ? size : len + 1; /* * Generate a runtime write overflow error if len is greater than * p_size. */ if (p_size < len) fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, p_size, len, -E2BIG); /* * We can now safely call vanilla strscpy because we are protected from: * 1. Read overflow thanks to call to strnlen(). * 2. Write overflow thanks to above ifs. */ return __real_strscpy(p, q, len); } /* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcat(char *p, const char *q, size_t avail) __RENAME(strlcat); /** * strlcat - Append a string to an existing string * * @p: pointer to %NUL-terminated string to append to * @q: pointer to %NUL-terminated string to append from * @avail: Maximum bytes available in @p * * Appends %NUL-terminated string @q after the %NUL-terminated * string at @p, but will not write beyond @avail bytes total, * potentially truncating the copy from @q. @p will stay * %NUL-terminated only if a %NUL already existed within * the @avail bytes of @p. If so, the resulting number of * bytes copied from @q will be at most "@avail - strlen(@p) - 1". * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf(), seq_buf, or similar. * * Returns total bytes that _would_ have been contained by @p * regardless of truncation, similar to snprintf(). If return * value is >= @avail, the string has been truncated. * */ __FORTIFY_INLINE size_t strlcat(char * const POS p, const char * const POS q, size_t avail) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len; size_t actual, wanted; /* Give up immediately if both buffer sizes are unknown. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcat(p, q, avail); p_len = strnlen(p, avail); copy_len = strlen(q); wanted = actual = p_len + copy_len; /* Cannot append any more: report truncation. */ if (avail <= p_len) return wanted; /* Give up if string is already overflowed. */ if (p_size <= p_len) fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, p_size, p_len + 1, wanted); if (actual >= avail) { copy_len = avail - p_len - 1; actual = p_len + copy_len; } /* Give up if copy will overflow. */ if (p_size <= actual) fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, p_size, actual + 1, wanted); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; return wanted; } /* Defined after fortified strlcat() to reuse it. 
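 *
 * Worked example of the semantics implemented above (illustration
 * only): with avail == 8, p == "abc" and q == "defghij", wanted is
 * 3 + 7 = 10.  Since 10 >= avail the copy is clamped to
 * avail - p_len - 1 = 4 bytes, giving "abcdefg" plus the terminating
 * NUL, and 10 is returned so the caller can detect truncation via
 * "return value >= avail".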
*/ /** * strcat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to NUL-terminated source string to append from * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the * destination buffer size is known to the compiler. Prefer * building the string with formatting, via scnprintf() or similar. * At the very least, use strncat(). * * Returns @p. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { const size_t p_size = __member_size(p); const size_t wanted = strlcat(p, q, p_size); if (p_size <= wanted) fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p_size, wanted + 1, p); return p; } /** * strncat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to source string to append from * @count: Maximum bytes to read from @q * * Appends at most @count bytes from @q (stopping at the first * NUL byte) after the NUL-terminated string at @p. @p will be * NUL-terminated. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf() or similar. * * Returns @p. * */ /* Defined after fortified strlen() and strnlen() to reuse them. */ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len, total; if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); total = p_len + copy_len + 1; if (p_size < total) fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p_size, total, p); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } __FORTIFY_INLINE bool fortify_memset_chk(__kernel_size_t size, const size_t p_size, const size_t p_size_field) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); /* Warn when write size is larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) 
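 *
 * Example (a sketch, the struct is illustrative and not from this
 * header): given
 *
 *	struct foo { char name[8]; int id; } f;
 *
 * memset(f.name, 0, 12) draws the compile-time
 * __write_overflow_field() warning (12 is larger than the 8-byte
 * member) but no panic, because 12 still fits the 12 bytes remaining
 * in the enclosing struct, while a non-constant size larger than that
 * remaining size is caught by the run-time check here.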
*/ if (p_size != SIZE_MAX && p_size < size) fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, p_size, size, true); return false; } #define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({ \ size_t __fortify_size = (size_t)(size); \ fortify_memset_chk(__fortify_size, p_size, p_size_field), \ __underlying_memset(p, c, __fortify_size); \ }) /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __struct_size(p), __member_size(p)) #endif /* * To make sure the compiler can enforce protection against buffer overflows, * memcpy(), memmove(), and memset() must not be used beyond individual * struct members. If you need to copy across multiple members, please use * struct_group() to create a named mirror of an anonymous struct union. * (e.g. see struct sk_buff.) Read overflow checking is currently only * done when a write overflow is also present, or when building with W=1. * * Mitigation coverage matrix * Bounds checking at: * +-------+-------+-------+-------+ * | Compile time | Run time | * memcpy() argument sizes: | write | read | write | read | * dest source length +-------+-------+-------+-------+ * memcpy(known, known, constant) | y | y | n/a | n/a | * memcpy(known, unknown, constant) | y | n | n/a | V | * memcpy(known, known, dynamic) | n | n | B | B | * memcpy(known, unknown, dynamic) | n | n | B | V | * memcpy(unknown, known, constant) | n | y | V | n/a | * memcpy(unknown, unknown, constant) | n | n | V | V | * memcpy(unknown, known, dynamic) | n | n | V | B | * memcpy(unknown, unknown, dynamic) | n | n | V | V | * +-------+-------+-------+-------+ * * y = perform deterministic compile-time bounds checking * n = cannot perform deterministic compile-time bounds checking * n/a = no run-time bounds checking needed since compile-time deterministic * B = can perform run-time bounds checking (currently unimplemented) * V = vulnerable to run-time overflow (will need refactoring to solve) * */ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t p_size, const size_t q_size, const size_t p_size_field, const size_t q_size_field, const u8 func) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); if (__compiletime_lessthan(q_size_field, q_size) && __compiletime_lessthan(q_size, size)) __read_overflow2(); /* Warn when write size argument larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); /* * Warn for source field over-read when building with W=1 * or when an over-write happened, so both can be fixed at * the same time. */ if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || __compiletime_lessthan(p_size_field, size)) && __compiletime_lessthan(q_size_field, size)) __read_overflow2_field(q_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) 
*/ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) fortify_panic(func, FORTIFY_WRITE, p_size, size, true); else if (q_size != SIZE_MAX && q_size < size) fortify_panic(func, FORTIFY_READ, q_size, size, true); /* * Warn when writing beyond destination field size. * * Note the implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be * detected at compile-time (as can be done when the destination * is specifically the flexible array member). * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 */ if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) return true; return false; } /* * To work around what seems to be an optimizer bug, the macro arguments * need to have const copies or the values end up changed by the time they * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture * __bos() results in const temp vars") for more details. */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ const size_t __p_size = (p_size); \ const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ /* Keep a mutable version of the size for the final copy. */ \ size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ /* Hide only the run-time size from value range tracking to */ \ /* silence compile-time false positive bounds warnings. */ \ if (!__builtin_constant_p(__copy_size)) \ OPTIMIZER_HIDE_VAR(__copy_size); \ __underlying_##op(p, q, __copy_size); \ }) /* * Notes about compile-time buffer size detection: * * With these types... * * struct middle { * u16 a; * u8 middle_buf[16]; * int b; * }; * struct end { * u16 a; * u8 end_buf[16]; * }; * struct flex { * int a; * u8 flex_buf[]; * }; * * void func(TYPE *ptr) { ... } * * Cases where destination size cannot be currently detected: * - the size of ptr's object (seemingly by design, gcc & clang fail): * __builtin_object_size(ptr, 1) == SIZE_MAX * - the size of flexible arrays in ptr's obj (by design, dynamic size): * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX * - the size of ANY array at the end of ptr's obj (gcc and clang bug): * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 * * Cases where destination size is currently detected: * - the size of non-array members within ptr's object: * __builtin_object_size(ptr->a, 1) == 2 * - the size of non-flexible-array in the middle of ptr's obj: * __builtin_object_size(ptr->middle_buf, 1) == 16 * */ /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. 
*/ #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memcpy) #define memmove(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memmove) extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, p_size, size, NULL); return __real_memscan(p, c, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3) int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size) { const size_t p_size = __struct_size(p); const size_t q_size = __struct_size(q); if (__builtin_constant_p(size)) { if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } if (p_size < size) fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, p_size, size, INT_MIN); else if (q_size < size) fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, q_size, size, INT_MIN); return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3) void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, p_size, size, NULL); return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, p_size, size, NULL); return __real_memchr_inv(p, c, size); } extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof) __realloc_size(2); __FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size, __real_kmemdup(p, 0, gfp)); return __real_kmemdup(p, size, gfp); } #define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) /** * strcpy - Copy a string into another string buffer * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * * Do not use this function. While FORTIFY_SOURCE tries to avoid * overflows, this is only possible when the sizes of @q and @p are * known to the compiler. Prefer strscpy(), though note its different * return values for detecting truncation. * * Returns @p. * */ /* Defined after fortified strlen to reuse it. */ __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t size; /* If neither buffer size is known, immediately give up. */ if (__builtin_constant_p(p_size) && __builtin_constant_p(q_size) && p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. 
*/ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, p_size, size, p); __underlying_memcpy(p, q, size); return p; } /* Don't use these outside the FORTIFY_SOURCE implementation */ #undef __underlying_memchr #undef __underlying_memcmp #undef __underlying_strcat #undef __underlying_strcpy #undef __underlying_strlen #undef __underlying_strncat #undef __underlying_strncpy #undef POS #undef POS0 #endif /* _LINUX_FORTIFY_STRING_H_ */
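/*
 * Illustrative usage sketch (not part of fortify-string.h): how the
 * fortified memcpy()/strscpy() wrappers above behave for a caller copying
 * into a fixed-size struct member. The struct, helper name, and sizes are
 * hypothetical, chosen only to show which checks can trigger.
 */
#include <linux/string.h>
#include <linux/types.h>

struct fortify_demo {
	u32 id;
	char name[16];		/* __member_size() resolves to 16 here */
	char tail[8];
};

static void fortify_demo_fill(struct fortify_demo *d, const char *src, size_t n)
{
	/*
	 * A constant length larger than the member, e.g. memcpy(d->name, src, 32),
	 * is rejected at compile time via __write_overflow_field().
	 *
	 * With a dynamic length, writing past the whole struct triggers
	 * fortify_panic(); writing past d->name while still inside the struct
	 * is reported as a "field-spanning write" warning.
	 */
	memcpy(d->name, src, n);

	/* strscpy() bounds the copy to the member size and NUL-terminates. */
	strscpy(d->tail, src, sizeof(d->tail));
}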
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ELF_H #define _ASM_X86_ELF_H /* * ELF register definitions.. */ #include <linux/thread_info.h> #include <asm/ia32.h> #include <asm/ptrace.h> #include <asm/user.h> #include <asm/auxvec.h> #include <asm/fsgsbase.h> typedef unsigned long elf_greg_t; #define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t)) typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct user_i387_struct elf_fpregset_t; #ifdef __i386__ #define R_386_NONE 0 #define R_386_32 1 #define R_386_PC32 2 #define R_386_GOT32 3 #define R_386_PLT32 4 #define R_386_COPY 5 #define R_386_GLOB_DAT 6 #define R_386_JMP_SLOT 7 #define R_386_RELATIVE 8 #define R_386_GOTOFF 9 #define R_386_GOTPC 10 #define R_386_NUM 11 /* * These are used to set parameters in the core dumps. */ #define ELF_CLASS ELFCLASS32 #define ELF_DATA ELFDATA2LSB #define ELF_ARCH EM_386 #else /* x86-64 relocation types */ #define R_X86_64_NONE 0 /* No reloc */ #define R_X86_64_64 1 /* Direct 64 bit */ #define R_X86_64_PC32 2 /* PC relative 32 bit signed */ #define R_X86_64_GOT32 3 /* 32 bit GOT entry */ #define R_X86_64_PLT32 4 /* 32 bit PLT address */ #define R_X86_64_COPY 5 /* Copy symbol at runtime */ #define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ #define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ #define R_X86_64_RELATIVE 8 /* Adjust by program base */ #define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative offset to GOT */ #define R_X86_64_GOTPCRELX 41 #define R_X86_64_REX_GOTPCRELX 42 #define R_X86_64_32 10 /* Direct 32 bit zero extended */ #define R_X86_64_32S 11 /* Direct 32 bit sign extended */ #define R_X86_64_16 12 /* Direct 16 bit zero extended */ #define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ #define R_X86_64_8 14 /* Direct 8 bit sign extended */ #define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ #define R_X86_64_PC64 24 /* Place relative 64-bit signed */ /* * These are used to set parameters in the core dumps. 
*/ #define ELF_CLASS ELFCLASS64 #define ELF_DATA ELFDATA2LSB #define ELF_ARCH EM_X86_64 #endif #include <asm/vdso.h> extern unsigned int vdso64_enabled; extern unsigned int vdso32_enabled; /* * This is used to ensure we don't load something for the wrong architecture. */ #define elf_check_arch_ia32(x) \ (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) #include <asm/processor.h> #ifdef CONFIG_X86_32 #include <asm/desc.h> #define elf_check_arch(x) elf_check_arch_ia32(x) /* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx contains a pointer to a function which might be registered using `atexit'. This provides a mean for the dynamic linker to call DT_FINI functions for shared libraries that have been loaded before the code runs. A value of 0 tells we have no such handler. We might as well make sure everything else is cleared too (except for %esp), just to make things more deterministic. */ #define ELF_PLAT_INIT(_r, load_addr) \ do { \ _r->bx = 0; _r->cx = 0; _r->dx = 0; \ _r->si = 0; _r->di = 0; _r->bp = 0; \ _r->ax = 0; \ } while (0) /* * regs is struct pt_regs, pr_reg is elf_gregset_t (which is * now struct_user_regs, they are different) */ #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ pr_reg[0] = regs->bx; \ pr_reg[1] = regs->cx; \ pr_reg[2] = regs->dx; \ pr_reg[3] = regs->si; \ pr_reg[4] = regs->di; \ pr_reg[5] = regs->bp; \ pr_reg[6] = regs->ax; \ pr_reg[7] = regs->ds; \ pr_reg[8] = regs->es; \ pr_reg[9] = regs->fs; \ savesegment(gs, pr_reg[10]); \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ pr_reg[13] = regs->cs; \ pr_reg[14] = regs->flags; \ pr_reg[15] = regs->sp; \ pr_reg[16] = regs->ss; \ } while (0); #define ELF_PLATFORM (utsname()->machine) #define set_personality_64bit() do { } while (0) #else /* CONFIG_X86_32 */ /* * This is used to ensure we don't load something for the wrong architecture. */ #define elf_check_arch(x) \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { /* ax gets execve's return value. */ /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0; regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; t->fsbase = t->gsbase = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; } #define ELF_PLAT_INIT(_r, load_addr) \ elf_common_init(¤t->thread, _r, 0) #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32); #define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \ compat_start_thread(regs, new_ip, new_sp, ex->e_machine == EM_X86_64) void set_personality_ia32(bool); #define COMPAT_SET_PERSONALITY(ex) \ set_personality_ia32((ex).e_machine == EM_X86_64) #define COMPAT_ELF_PLATFORM ("i686") /* * regs is struct pt_regs, pr_reg is elf_gregset_t (which is * now struct_user_regs, they are different). Assumes current is the process * getting dumped. 
*/ #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ (pr_reg)[2] = (regs)->r13; \ (pr_reg)[3] = (regs)->r12; \ (pr_reg)[4] = (regs)->bp; \ (pr_reg)[5] = (regs)->bx; \ (pr_reg)[6] = (regs)->r11; \ (pr_reg)[7] = (regs)->r10; \ (pr_reg)[8] = (regs)->r9; \ (pr_reg)[9] = (regs)->r8; \ (pr_reg)[10] = (regs)->ax; \ (pr_reg)[11] = (regs)->cx; \ (pr_reg)[12] = (regs)->dx; \ (pr_reg)[13] = (regs)->si; \ (pr_reg)[14] = (regs)->di; \ (pr_reg)[15] = (regs)->orig_ax; \ (pr_reg)[16] = (regs)->ip; \ (pr_reg)[17] = (regs)->cs; \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ (pr_reg)[21] = x86_fsbase_read_cpu(); \ (pr_reg)[22] = x86_gsbase_read_cpu_inactive(); \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \ } while (0); /* I'm not sure if we can use '-' here */ #define ELF_PLATFORM ("x86_64") extern void set_personality_64bit(void); extern int force_personality32; #endif /* !CONFIG_X86_32 */ #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 /* * This is the base location for PIE (ET_DYN with INTERP) loads. On * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ (DEFAULT_MAP_WINDOW / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, but it's not easy, and we've already done it here. */ #define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX]) extern u32 elf_hwcap2; /* * HWCAP2 supplies mask with kernel enabled CPU features, so that * the application can discover that it can safely use them. * The bits are defined in uapi/asm/hwcap2.h. */ #define ELF_HWCAP2 (elf_hwcap2) /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in intent than poking at uname or /proc/cpuinfo. For the moment, we have only optimizations for the Intel generations, but that could change... */ #define SET_PERSONALITY(ex) set_personality_64bit() /* * An executable for which elf_read_implies_exec() returns TRUE will * have the READ_IMPLIES_EXEC personality flag set automatically. * * The decision process for determining the results are: * * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 | * ELF: | | | | * ---------------------|------------|------------------|----------------| * missing PT_GNU_STACK | exec-all | exec-all | exec-none | * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack | * PT_GNU_STACK == RW | exec-none | exec-none | exec-none | * * exec-all : all PROT_READ user mappings are executable, except when * backed by files on a noexec-filesystem. * exec-none : only PROT_EXEC user mappings are executable. * exec-stack: only the stack and PROT_EXEC user mappings are executable. 
* * *this column has no architectural effect: NX markings are ignored by * hardware, but may have behavioral effects when "wants X" collides with * "cannot be X" constraints in memory permission flags, as in * https://lkml.kernel.org/r/20190418055759.GA3155@mellanox.com * */ #define elf_read_implies_exec(ex, executable_stack) \ (mmap_is_ia32() && executable_stack == EXSTACK_DEFAULT) struct task_struct; #define ARCH_DLINFO_IA32 \ do { \ if (VDSO_CURRENT_BASE) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* * True on X86_32 or when emulating IA32 on X86_64 */ static inline int mmap_is_ia32(void) { return IS_ENABLED(CONFIG_X86_32) || (IS_ENABLED(CONFIG_COMPAT) && test_thread_flag(TIF_ADDR32)); } extern unsigned long task_size_32bit(void); extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); extern unsigned long get_sigframe_size(void); #ifdef CONFIG_X86_32 #define __STACK_RND_MASK(is32bit) (0x7ff) #define STACK_RND_MASK (0x7ff) #define ARCH_DLINFO ARCH_DLINFO_IA32 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #else /* CONFIG_X86_32 */ /* 1GB for 64bit, 8MB for 32bit */ #define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) #define ARCH_DLINFO \ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ #define ARCH_DLINFO_X32 \ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) #define AT_SYSINFO 32 #define COMPAT_ARCH_DLINFO \ if (exec->e_machine == EM_X86_64) \ ARCH_DLINFO_X32; \ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \ ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) #endif /* !CONFIG_X86_32 */ #define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso) #define VDSO_ENTRY \ ((unsigned long)current->mm->context.vdso + \ vdso_image_32.sym___kernel_vsyscall) struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, bool x32); #define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ compat_arch_setup_additional_pages(bprm, interpreter, \ (ex->e_machine == EM_X86_64)) extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs); /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), ALIGN_VA_64 = BIT(1), }; struct va_alignment { int flags; unsigned long mask; unsigned long bits; } ____cacheline_aligned; extern struct va_alignment va_align; #endif /* _ASM_X86_ELF_H */ |
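/*
 * Illustrative sketch (not part of this header): the 64-bit
 * ELF_CORE_COPY_REGS macro defined above fills an elf_gregset_t from a
 * struct pt_regs. The helper below is hypothetical and only demonstrates
 * the calling convention; with CORE_DUMP_USE_REGSET defined, the kernel's
 * actual core-dump path collects registers through regsets instead.
 */
static void demo_copy_core_regs(elf_gregset_t gregs, struct pt_regs *regs)
{
	/* Expands to the per-register assignments listed above. */
	ELF_CORE_COPY_REGS(gregs, regs);
}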
// SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. 
*/ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_read_value); /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc(sizeof(*qos), GFP_KERNEL); if (!qos) return -ENOMEM; n = kcalloc(3, sizeof(*n), GFP_KERNEL); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void 
__dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. 
It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. 
*/ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. 
*/ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. 
*/ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. 
* @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance); |
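/*
 * Illustrative usage sketch (not part of this file): a driver keeping a
 * per-device resume-latency constraint through the request API implemented
 * above. The helper and request names are hypothetical and error handling
 * is reduced to the minimum.
 */
#include <linux/pm_qos.h>

static struct dev_pm_qos_request demo_latency_req;

static int demo_constrain_resume_latency(struct device *dev)
{
	int ret;

	/* Request a resume latency of at most 100 microseconds. */
	ret = dev_pm_qos_add_request(dev, &demo_latency_req,
				     DEV_PM_QOS_RESUME_LATENCY, 100);
	if (ret < 0)
		return ret;

	/* The same handle can later be tightened or relaxed... */
	dev_pm_qos_update_request(&demo_latency_req, 50);

	/* ...and must be dropped (e.g. in .remove()) when no longer needed. */
	dev_pm_qos_remove_request(&demo_latency_req);
	return 0;
}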
// SPDX-License-Identifier: GPL-2.0-or-later /* * acpi_bus.c - ACPI Bus Driver ($Revision: 80 $) * * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/sched.h> #include <linux/pm.h> #include <linux/device.h> #include <linux/proc_fs.h> #include <linux/acpi.h> #include <linux/slab.h> #include <linux/regulator/machine.h> #include <linux/workqueue.h> #include <linux/reboot.h> #include <linux/delay.h> #ifdef CONFIG_X86 #include <asm/mpspec.h> #include <linux/dmi.h> #endif #include <linux/acpi_viot.h> #include <linux/pci.h> #include <acpi/apei.h> #include <linux/suspend.h> 
#include <linux/prmt.h> #include "internal.h" struct acpi_device *acpi_root; struct proc_dir_entry *acpi_root_dir; EXPORT_SYMBOL(acpi_root_dir); #ifdef CONFIG_X86 #ifdef CONFIG_ACPI_CUSTOM_DSDT static inline int set_copy_dsdt(const struct dmi_system_id *id) { return 0; } #else static int set_copy_dsdt(const struct dmi_system_id *id) { pr_notice("%s detected - force copy of DSDT to local memory\n", id->ident); acpi_gbl_copy_dsdt_locally = 1; return 0; } #endif static const struct dmi_system_id dsdt_dmi_table[] __initconst = { /* * Invoke DSDT corruption work-around on all Toshiba Satellite. * https://bugzilla.kernel.org/show_bug.cgi?id=14679 */ { .callback = set_copy_dsdt, .ident = "TOSHIBA Satellite", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), DMI_MATCH(DMI_PRODUCT_NAME, "Satellite"), }, }, {} }; #endif /* -------------------------------------------------------------------------- Device Management -------------------------------------------------------------------------- */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta) { acpi_status status; status = acpi_evaluate_integer(handle, "_STA", NULL, sta); if (ACPI_SUCCESS(status)) return AE_OK; if (status == AE_NOT_FOUND) { *sta = ACPI_STA_DEVICE_PRESENT | ACPI_STA_DEVICE_ENABLED | ACPI_STA_DEVICE_UI | ACPI_STA_DEVICE_FUNCTIONING; return AE_OK; } return status; } EXPORT_SYMBOL_GPL(acpi_bus_get_status_handle); int acpi_bus_get_status(struct acpi_device *device) { acpi_status status; unsigned long long sta; if (acpi_device_override_status(device, &sta)) { acpi_set_device_status(device, sta); return 0; } /* Battery devices must have their deps met before calling _STA */ if (acpi_device_is_battery(device) && device->dep_unmet) { acpi_set_device_status(device, 0); return 0; } status = acpi_bus_get_status_handle(device->handle, &sta); if (ACPI_FAILURE(status)) return -ENODEV; if (!device->status.present && device->status.enabled) { pr_info(FW_BUG "Device [%s] status [%08x]: not present and enabled\n", device->pnp.bus_id, (u32)sta); device->status.enabled = 0; /* * The status is clearly invalid, so clear the functional bit as * well to avoid attempting to use the device. 
*/ device->status.functional = 0; } acpi_set_device_status(device, sta); if (device->status.functional && !device->status.present) { pr_debug("Device [%s] status [%08x]: functional but not present\n", device->pnp.bus_id, (u32)sta); } pr_debug("Device [%s] status [%08x]\n", device->pnp.bus_id, (u32)sta); return 0; } EXPORT_SYMBOL(acpi_bus_get_status); void acpi_bus_private_data_handler(acpi_handle handle, void *context) { return; } EXPORT_SYMBOL(acpi_bus_private_data_handler); int acpi_bus_attach_private_data(acpi_handle handle, void *data) { acpi_status status; status = acpi_attach_data(handle, acpi_bus_private_data_handler, data); if (ACPI_FAILURE(status)) { acpi_handle_debug(handle, "Error attaching device data\n"); return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(acpi_bus_attach_private_data); int acpi_bus_get_private_data(acpi_handle handle, void **data) { acpi_status status; if (!data) return -EINVAL; status = acpi_get_data(handle, acpi_bus_private_data_handler, data); if (ACPI_FAILURE(status)) { acpi_handle_debug(handle, "No context for object\n"); return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(acpi_bus_get_private_data); void acpi_bus_detach_private_data(acpi_handle handle) { acpi_detach_data(handle, acpi_bus_private_data_handler); } EXPORT_SYMBOL_GPL(acpi_bus_detach_private_data); static void acpi_print_osc_error(acpi_handle handle, struct acpi_osc_context *context, char *error) { int i; acpi_handle_debug(handle, "(%s): %s\n", context->uuid_str, error); pr_debug("_OSC request data:"); for (i = 0; i < context->cap.length; i += sizeof(u32)) pr_debug(" %x", *((u32 *)(context->cap.pointer + i))); pr_debug("\n"); } acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context) { acpi_status status; struct acpi_object_list input; union acpi_object in_params[4]; union acpi_object *out_obj; guid_t guid; u32 errors; struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; if (!context) return AE_ERROR; if (guid_parse(context->uuid_str, &guid)) return AE_ERROR; context->ret.length = ACPI_ALLOCATE_BUFFER; context->ret.pointer = NULL; /* Setting up input parameters */ input.count = 4; input.pointer = in_params; in_params[0].type = ACPI_TYPE_BUFFER; in_params[0].buffer.length = 16; in_params[0].buffer.pointer = (u8 *)&guid; in_params[1].type = ACPI_TYPE_INTEGER; in_params[1].integer.value = context->rev; in_params[2].type = ACPI_TYPE_INTEGER; in_params[2].integer.value = context->cap.length/sizeof(u32); in_params[3].type = ACPI_TYPE_BUFFER; in_params[3].buffer.length = context->cap.length; in_params[3].buffer.pointer = context->cap.pointer; status = acpi_evaluate_object(handle, "_OSC", &input, &output); if (ACPI_FAILURE(status)) return status; if (!output.length) return AE_NULL_OBJECT; out_obj = output.pointer; if (out_obj->type != ACPI_TYPE_BUFFER || out_obj->buffer.length != context->cap.length) { acpi_print_osc_error(handle, context, "_OSC evaluation returned wrong type"); status = AE_TYPE; goto out_kfree; } /* Need to ignore the bit0 in result code */ errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); if (errors) { if (errors & OSC_REQUEST_ERROR) acpi_print_osc_error(handle, context, "_OSC request failed"); if (errors & OSC_INVALID_UUID_ERROR) acpi_print_osc_error(handle, context, "_OSC invalid UUID"); if (errors & OSC_INVALID_REVISION_ERROR) acpi_print_osc_error(handle, context, "_OSC invalid revision"); if (errors & OSC_CAPABILITIES_MASK_ERROR) { if (((u32 *)context->cap.pointer)[OSC_QUERY_DWORD] & OSC_QUERY_ENABLE) goto out_success; status = AE_SUPPORT; goto 
out_kfree; } status = AE_ERROR; goto out_kfree; } out_success: context->ret.length = out_obj->buffer.length; context->ret.pointer = kmemdup(out_obj->buffer.pointer, context->ret.length, GFP_KERNEL); if (!context->ret.pointer) { status = AE_NO_MEMORY; goto out_kfree; } status = AE_OK; out_kfree: kfree(output.pointer); return status; } EXPORT_SYMBOL(acpi_run_osc); bool osc_sb_apei_support_acked; /* * ACPI 6.0 Section 8.4.4.2 Idle State Coordination * OSPM supports platform coordinated low power idle(LPI) states */ bool osc_pc_lpi_support_confirmed; EXPORT_SYMBOL_GPL(osc_pc_lpi_support_confirmed); /* * ACPI 6.2 Section 6.2.11.2 'Platform-Wide OSPM Capabilities': * Starting with ACPI Specification 6.2, all _CPC registers can be in * PCC, System Memory, System IO, or Functional Fixed Hardware address * spaces. OSPM support for this more flexible register space scheme is * indicated by the “Flexible Address Space for CPPC Registers” _OSC bit. * * Otherwise (cf ACPI 6.1, s8.4.7.1.1.X), _CPC registers must be in: * - PCC or Functional Fixed Hardware address space if defined * - SystemMemory address space (NULL register) if not defined */ bool osc_cpc_flexible_adr_space_confirmed; EXPORT_SYMBOL_GPL(osc_cpc_flexible_adr_space_confirmed); /* * ACPI 6.4 Operating System Capabilities for USB. */ bool osc_sb_native_usb4_support_confirmed; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_support_confirmed); bool osc_sb_cppc2_support_acked; static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; static void acpi_bus_osc_negotiate_platform_control(void) { u32 capbuf[2], *capbuf_ret; struct acpi_osc_context context = { .uuid_str = sb_uuid_str, .rev = 1, .cap.length = 8, .cap.pointer = capbuf, }; acpi_handle handle; capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = OSC_SB_PR3_SUPPORT; /* _PR3 is in use */ if (IS_ENABLED(CONFIG_ACPI_PROCESSOR_AGGREGATOR)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PAD_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_PROCESSOR)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PPC_OST_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_THERMAL)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_BATTERY)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_BATTERY_CHARGE_LIMITING_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_OVER_16_PSTATES_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GED_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_PRMT)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_FFH)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_FFH_OPR_SUPPORT; #ifdef CONFIG_ARM64 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; #endif #ifdef CONFIG_X86 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; #endif #ifdef CONFIG_ACPI_CPPC_LIB capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT; #endif capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_FLEXIBLE_ADR_SPACE; if (IS_ENABLED(CONFIG_SCHED_MC_PRIO)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT; if (IS_ENABLED(CONFIG_USB4)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_NATIVE_USB4_SUPPORT; if (!ghes_disable) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_APEI_SUPPORT; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; if (ACPI_FAILURE(acpi_run_osc(handle, &context))) return; capbuf_ret = context.ret.pointer; if (context.ret.length <= OSC_SUPPORT_DWORD) { kfree(context.ret.pointer); 
return; } /* * Now run _OSC again with query flag clear and with the caps * supported by both the OS and the platform. */ capbuf[OSC_QUERY_DWORD] = 0; capbuf[OSC_SUPPORT_DWORD] = capbuf_ret[OSC_SUPPORT_DWORD]; kfree(context.ret.pointer); if (ACPI_FAILURE(acpi_run_osc(handle, &context))) return; capbuf_ret = context.ret.pointer; if (context.ret.length > OSC_SUPPORT_DWORD) { #ifdef CONFIG_ACPI_CPPC_LIB osc_sb_cppc2_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPCV2_SUPPORT; #endif osc_sb_apei_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_APEI_SUPPORT; osc_pc_lpi_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_PCLPI_SUPPORT; osc_sb_native_usb4_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_NATIVE_USB4_SUPPORT; osc_cpc_flexible_adr_space_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPC_FLEXIBLE_ADR_SPACE; } kfree(context.ret.pointer); } /* * Native control of USB4 capabilities. If any of the tunneling bits is * set it means OS is in control and we use software based connection * manager. */ u32 osc_sb_native_usb4_control; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_control); static void acpi_bus_decode_usb_osc(const char *msg, u32 bits) { pr_info("%s USB3%c DisplayPort%c PCIe%c XDomain%c\n", msg, (bits & OSC_USB_USB3_TUNNELING) ? '+' : '-', (bits & OSC_USB_DP_TUNNELING) ? '+' : '-', (bits & OSC_USB_PCIE_TUNNELING) ? '+' : '-', (bits & OSC_USB_XDOMAIN) ? '+' : '-'); } static u8 sb_usb_uuid_str[] = "23A0D13A-26AB-486C-9C5F-0FFA525A575A"; static void acpi_bus_osc_negotiate_usb_control(void) { u32 capbuf[3], *capbuf_ret; struct acpi_osc_context context = { .uuid_str = sb_usb_uuid_str, .rev = 1, .cap.length = sizeof(capbuf), .cap.pointer = capbuf, }; acpi_handle handle; acpi_status status; u32 control; if (!osc_sb_native_usb4_support_confirmed) return; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; control = OSC_USB_USB3_TUNNELING | OSC_USB_DP_TUNNELING | OSC_USB_PCIE_TUNNELING | OSC_USB_XDOMAIN; /* * Run _OSC first with query bit set, trying to get control over * all tunneling. The platform can then clear out bits in the * control dword that it does not want to grant to the OS. */ capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = 0; capbuf[OSC_CONTROL_DWORD] = control; status = acpi_run_osc(handle, &context); if (ACPI_FAILURE(status)) return; if (context.ret.length != sizeof(capbuf)) { pr_info("USB4 _OSC: returned invalid length buffer\n"); goto out_free; } /* * Run _OSC again now with query bit clear and the control dword * matching what the platform granted (which may not have all * the control bits set). */ capbuf_ret = context.ret.pointer; capbuf[OSC_QUERY_DWORD] = 0; capbuf[OSC_CONTROL_DWORD] = capbuf_ret[OSC_CONTROL_DWORD]; kfree(context.ret.pointer); status = acpi_run_osc(handle, &context); if (ACPI_FAILURE(status)) return; if (context.ret.length != sizeof(capbuf)) { pr_info("USB4 _OSC: returned invalid length buffer\n"); goto out_free; } osc_sb_native_usb4_control = control & acpi_osc_ctx_get_pci_control(&context); acpi_bus_decode_usb_osc("USB4 _OSC: OS supports", control); acpi_bus_decode_usb_osc("USB4 _OSC: OS controls", osc_sb_native_usb4_control); out_free: kfree(context.ret.pointer); } /* -------------------------------------------------------------------------- Notification Handling -------------------------------------------------------------------------- */ /** * acpi_bus_notify - Global system-level (0x00-0x7F) notifications handler * @handle: Target ACPI object. * @type: Notification type. 
* @data: Ignored. * * This only handles notifications related to device hotplug. */ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data) { struct acpi_device *adev; switch (type) { case ACPI_NOTIFY_BUS_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_BUS_CHECK event\n"); break; case ACPI_NOTIFY_DEVICE_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK event\n"); break; case ACPI_NOTIFY_DEVICE_WAKE: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_WAKE event\n"); return; case ACPI_NOTIFY_EJECT_REQUEST: acpi_handle_debug(handle, "ACPI_NOTIFY_EJECT_REQUEST event\n"); break; case ACPI_NOTIFY_DEVICE_CHECK_LIGHT: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK_LIGHT event\n"); /* TBD: Exactly what does 'light' mean? */ return; case ACPI_NOTIFY_FREQUENCY_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a frequency mismatch\n"); return; case ACPI_NOTIFY_BUS_MODE_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a bus mode mismatch\n"); return; case ACPI_NOTIFY_POWER_FAULT: acpi_handle_err(handle, "Device has suffered a power fault\n"); return; default: acpi_handle_debug(handle, "Unknown event type 0x%x\n", type); return; } adev = acpi_get_acpi_dev(handle); if (adev && ACPI_SUCCESS(acpi_hotplug_schedule(adev, type))) return; acpi_put_acpi_dev(adev); acpi_evaluate_ost(handle, type, ACPI_OST_SC_NON_SPECIFIC_FAILURE, NULL); } static void acpi_notify_device(acpi_handle handle, u32 event, void *data) { struct acpi_device *device = data; struct acpi_driver *acpi_drv = to_acpi_driver(device->dev.driver); acpi_drv->ops.notify(device, event); } static int acpi_device_install_notify_handler(struct acpi_device *device, struct acpi_driver *acpi_drv) { u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; acpi_status status; status = acpi_install_notify_handler(device->handle, type, acpi_notify_device, device); if (ACPI_FAILURE(status)) return -EINVAL; return 0; } static void acpi_device_remove_notify_handler(struct acpi_device *device, struct acpi_driver *acpi_drv) { u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? 
ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; acpi_remove_notify_handler(device->handle, type, acpi_notify_device); acpi_os_wait_events_complete(); } int acpi_dev_install_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler, void *context) { acpi_status status; status = acpi_install_notify_handler(adev->handle, handler_type, handler, context); if (ACPI_FAILURE(status)) return -ENODEV; return 0; } EXPORT_SYMBOL_GPL(acpi_dev_install_notify_handler); void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler) { acpi_remove_notify_handler(adev->handle, handler_type, handler); acpi_os_wait_events_complete(); } EXPORT_SYMBOL_GPL(acpi_dev_remove_notify_handler); /* Handle events targeting \_SB device (at present only graceful shutdown) */ #define ACPI_SB_NOTIFY_SHUTDOWN_REQUEST 0x81 #define ACPI_SB_INDICATE_INTERVAL 10000 static void sb_notify_work(struct work_struct *dummy) { acpi_handle sb_handle; orderly_poweroff(true); /* * After initiating graceful shutdown, the ACPI spec requires OSPM * to evaluate _OST method once every 10seconds to indicate that * the shutdown is in progress */ acpi_get_handle(NULL, "\\_SB", &sb_handle); while (1) { pr_info("Graceful shutdown in progress.\n"); acpi_evaluate_ost(sb_handle, ACPI_OST_EC_OSPM_SHUTDOWN, ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS, NULL); msleep(ACPI_SB_INDICATE_INTERVAL); } } static void acpi_sb_notify(acpi_handle handle, u32 event, void *data) { static DECLARE_WORK(acpi_sb_work, sb_notify_work); if (event == ACPI_SB_NOTIFY_SHUTDOWN_REQUEST) { if (!work_busy(&acpi_sb_work)) schedule_work(&acpi_sb_work); } else { pr_warn("event %x is not supported by \\_SB device\n", event); } } static int __init acpi_setup_sb_notify_handler(void) { acpi_handle sb_handle; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &sb_handle))) return -ENXIO; if (ACPI_FAILURE(acpi_install_notify_handler(sb_handle, ACPI_DEVICE_NOTIFY, acpi_sb_notify, NULL))) return -EINVAL; return 0; } /* -------------------------------------------------------------------------- Device Matching -------------------------------------------------------------------------- */ /** * acpi_get_first_physical_node - Get first physical node of an ACPI device * @adev: ACPI device in question * * Return: First physical node of ACPI device @adev */ struct device *acpi_get_first_physical_node(struct acpi_device *adev) { struct mutex *physical_node_lock = &adev->physical_node_lock; struct device *phys_dev; mutex_lock(physical_node_lock); if (list_empty(&adev->physical_node_list)) { phys_dev = NULL; } else { const struct acpi_device_physical_node *node; node = list_first_entry(&adev->physical_node_list, struct acpi_device_physical_node, node); phys_dev = node->dev; } mutex_unlock(physical_node_lock); return phys_dev; } EXPORT_SYMBOL_GPL(acpi_get_first_physical_node); static struct acpi_device *acpi_primary_dev_companion(struct acpi_device *adev, const struct device *dev) { const struct device *phys_dev = acpi_get_first_physical_node(adev); return phys_dev && phys_dev == dev ? adev : NULL; } /** * acpi_device_is_first_physical_node - Is given dev first physical node * @adev: ACPI companion device * @dev: Physical device to check * * Function checks if given @dev is the first physical devices attached to * the ACPI companion device. This distinction is needed in some cases * where the same companion device is shared between many physical devices. * * Note that the caller have to provide valid @adev pointer. 
*/ bool acpi_device_is_first_physical_node(struct acpi_device *adev, const struct device *dev) { return !!acpi_primary_dev_companion(adev, dev); } /* * acpi_companion_match() - Can we match via ACPI companion device * @dev: Device in question * * Check if the given device has an ACPI companion and if that companion has * a valid list of PNP IDs, and if the device is the first (primary) physical * device associated with it. Return the companion pointer if that's the case * or NULL otherwise. * * If multiple physical devices are attached to a single ACPI companion, we need * to be careful. The usage scenario for this kind of relationship is that all * of the physical devices in question use resources provided by the ACPI * companion. A typical case is an MFD device where all the sub-devices share * the parent's ACPI companion. In such cases we can only allow the primary * (first) physical device to be matched with the help of the companion's PNP * IDs. * * Additional physical devices sharing the ACPI companion can still use * resources available from it but they will be matched normally using functions * provided by their bus types (and analogously for their modalias). */ const struct acpi_device *acpi_companion_match(const struct device *dev) { struct acpi_device *adev; adev = ACPI_COMPANION(dev); if (!adev) return NULL; if (list_empty(&adev->pnp.ids)) return NULL; return acpi_primary_dev_companion(adev, dev); } /** * acpi_of_match_device - Match device object using the "compatible" property. * @adev: ACPI device object to match. * @of_match_table: List of device IDs to match against. * @of_id: OF ID if matched * * If @dev has an ACPI companion which has ACPI_DT_NAMESPACE_HID in its list of * identifiers and a _DSD object with the "compatible" property, use that * property to match against the given list of identifiers. */ static bool acpi_of_match_device(const struct acpi_device *adev, const struct of_device_id *of_match_table, const struct of_device_id **of_id) { const union acpi_object *of_compatible, *obj; int i, nval; if (!adev) return false; of_compatible = adev->data.of_compatible; if (!of_match_table || !of_compatible) return false; if (of_compatible->type == ACPI_TYPE_PACKAGE) { nval = of_compatible->package.count; obj = of_compatible->package.elements; } else { /* Must be ACPI_TYPE_STRING. */ nval = 1; obj = of_compatible; } /* Now we can look for the driver DT compatible strings */ for (i = 0; i < nval; i++, obj++) { const struct of_device_id *id; for (id = of_match_table; id->compatible[0]; id++) if (!strcasecmp(obj->string.pointer, id->compatible)) { if (of_id) *of_id = id; return true; } } return false; } static bool acpi_of_modalias(struct acpi_device *adev, char *modalias, size_t len) { const union acpi_object *of_compatible; const union acpi_object *obj; const char *str, *chr; of_compatible = adev->data.of_compatible; if (!of_compatible) return false; if (of_compatible->type == ACPI_TYPE_PACKAGE) obj = of_compatible->package.elements; else /* Must be ACPI_TYPE_STRING. */ obj = of_compatible; str = obj->string.pointer; chr = strchr(str, ','); strscpy(modalias, chr ? 
chr + 1 : str, len); return true; } /** * acpi_set_modalias - Set modalias using "compatible" property or supplied ID * @adev: ACPI device object to match * @default_id: ID string to use as default if no compatible string found * @modalias: Pointer to buffer that modalias value will be copied into * @len: Length of modalias buffer * * This is a counterpart of of_alias_from_compatible() for struct acpi_device * objects. If there is a compatible string for @adev, it will be copied to * @modalias with the vendor prefix stripped; otherwise, @default_id will be * used. */ void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len) { if (!acpi_of_modalias(adev, modalias, len)) strscpy(modalias, default_id, len); } EXPORT_SYMBOL_GPL(acpi_set_modalias); static bool __acpi_match_device_cls(const struct acpi_device_id *id, struct acpi_hardware_id *hwid) { int i, msk, byte_shift; char buf[3]; if (!id->cls) return false; /* Apply class-code bitmask, before checking each class-code byte */ for (i = 1; i <= 3; i++) { byte_shift = 8 * (3 - i); msk = (id->cls_msk >> byte_shift) & 0xFF; if (!msk) continue; sprintf(buf, "%02x", (id->cls >> byte_shift) & msk); if (strncmp(buf, &hwid->id[(i - 1) * 2], 2)) return false; } return true; } static bool __acpi_match_device(const struct acpi_device *device, const struct acpi_device_id *acpi_ids, const struct of_device_id *of_ids, const struct acpi_device_id **acpi_id, const struct of_device_id **of_id) { const struct acpi_device_id *id; struct acpi_hardware_id *hwid; /* * If the device is not present, it is unnecessary to load device * driver for it. */ if (!device || !device->status.present) return false; list_for_each_entry(hwid, &device->pnp.ids, list) { /* First, check the ACPI/PNP IDs provided by the caller. */ if (acpi_ids) { for (id = acpi_ids; id->id[0] || id->cls; id++) { if (id->id[0] && !strcmp((char *)id->id, hwid->id)) goto out_acpi_match; if (id->cls && __acpi_match_device_cls(id, hwid)) goto out_acpi_match; } } /* * Next, check ACPI_DT_NAMESPACE_HID and try to match the * "compatible" property if found. */ if (!strcmp(ACPI_DT_NAMESPACE_HID, hwid->id)) return acpi_of_match_device(device, of_ids, of_id); } return false; out_acpi_match: if (acpi_id) *acpi_id = id; return true; } /** * acpi_match_acpi_device - Match an ACPI device against a given list of ACPI IDs * @ids: Array of struct acpi_device_id objects to match against. * @adev: The ACPI device pointer to match. * * Match the ACPI device @adev against a given list of ACPI IDs @ids. * * Return: * a pointer to the first matching ACPI ID on success or %NULL on failure. */ const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids, const struct acpi_device *adev) { const struct acpi_device_id *id = NULL; __acpi_match_device(adev, ids, NULL, &id, NULL); return id; } EXPORT_SYMBOL_GPL(acpi_match_acpi_device); /** * acpi_match_device - Match a struct device against a given list of ACPI IDs * @ids: Array of struct acpi_device_id object to match against. * @dev: The device structure to match. * * Check if @dev has a valid ACPI handle and if there is a struct acpi_device * object for that handle and use that object to match against a given list of * device IDs. * * Return a pointer to the first matching ID on success or %NULL on failure. 
*/ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev) { return acpi_match_acpi_device(ids, acpi_companion_match(dev)); } EXPORT_SYMBOL_GPL(acpi_match_device); static const void *acpi_of_device_get_match_data(const struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); const struct of_device_id *match = NULL; if (!acpi_of_match_device(adev, dev->driver->of_match_table, &match)) return NULL; return match->data; } const void *acpi_device_get_match_data(const struct device *dev) { const struct acpi_device_id *acpi_ids = dev->driver->acpi_match_table; const struct acpi_device_id *match; if (!acpi_ids) return acpi_of_device_get_match_data(dev); match = acpi_match_device(acpi_ids, dev); if (!match) return NULL; return (const void *)match->driver_data; } EXPORT_SYMBOL_GPL(acpi_device_get_match_data); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids) { return __acpi_match_device(device, ids, NULL, NULL, NULL) ? 0 : -ENOENT; } EXPORT_SYMBOL(acpi_match_device_ids); bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv) { const struct acpi_device_id *acpi_ids = drv->acpi_match_table; const struct of_device_id *of_ids = drv->of_match_table; if (!acpi_ids) return acpi_of_match_device(ACPI_COMPANION(dev), of_ids, NULL); return __acpi_match_device(acpi_companion_match(dev), acpi_ids, of_ids, NULL, NULL); } EXPORT_SYMBOL_GPL(acpi_driver_match_device); /* -------------------------------------------------------------------------- ACPI Driver Management -------------------------------------------------------------------------- */ /** * __acpi_bus_register_driver - register a driver with the ACPI bus * @driver: driver being registered * @owner: owning module/driver * * Registers a driver with the ACPI bus. Searches the namespace for all * devices that match the driver's criteria and binds. Returns zero for * success or a negative error status for failure. */ int __acpi_bus_register_driver(struct acpi_driver *driver, struct module *owner) { if (acpi_disabled) return -ENODEV; driver->drv.name = driver->name; driver->drv.bus = &acpi_bus_type; driver->drv.owner = owner; return driver_register(&driver->drv); } EXPORT_SYMBOL(__acpi_bus_register_driver); /** * acpi_bus_unregister_driver - unregisters a driver with the ACPI bus * @driver: driver to unregister * * Unregisters a driver with the ACPI bus. Searches the namespace for all * devices that match the driver's criteria and unbinds. 
*/ void acpi_bus_unregister_driver(struct acpi_driver *driver) { driver_unregister(&driver->drv); } EXPORT_SYMBOL(acpi_bus_unregister_driver); /* -------------------------------------------------------------------------- ACPI Bus operations -------------------------------------------------------------------------- */ static int acpi_bus_match(struct device *dev, const struct device_driver *drv) { struct acpi_device *acpi_dev = to_acpi_device(dev); const struct acpi_driver *acpi_drv = to_acpi_driver(drv); return acpi_dev->flags.match_driver && !acpi_match_device_ids(acpi_dev, acpi_drv->ids); } static int acpi_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { return __acpi_device_uevent_modalias(to_acpi_device(dev), env); } static int acpi_device_probe(struct device *dev) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); int ret; if (acpi_dev->handler && !acpi_is_pnp_device(acpi_dev)) return -EINVAL; if (!acpi_drv->ops.add) return -ENOSYS; ret = acpi_drv->ops.add(acpi_dev); if (ret) { acpi_dev->driver_data = NULL; return ret; } pr_debug("Driver [%s] successfully bound to device [%s]\n", acpi_drv->name, acpi_dev->pnp.bus_id); if (acpi_drv->ops.notify) { ret = acpi_device_install_notify_handler(acpi_dev, acpi_drv); if (ret) { if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); acpi_dev->driver_data = NULL; return ret; } } pr_debug("Found driver [%s] for device [%s]\n", acpi_drv->name, acpi_dev->pnp.bus_id); get_device(dev); return 0; } static void acpi_device_remove(struct device *dev) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); if (acpi_drv->ops.notify) acpi_device_remove_notify_handler(acpi_dev, acpi_drv); if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); acpi_dev->driver_data = NULL; put_device(dev); } const struct bus_type acpi_bus_type = { .name = "acpi", .match = acpi_bus_match, .probe = acpi_device_probe, .remove = acpi_device_remove, .uevent = acpi_device_uevent, }; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data) { return bus_for_each_dev(&acpi_bus_type, NULL, data, fn); } EXPORT_SYMBOL_GPL(acpi_bus_for_each_dev); struct acpi_dev_walk_context { int (*fn)(struct acpi_device *, void *); void *data; }; static int acpi_dev_for_one_check(struct device *dev, void *context) { struct acpi_dev_walk_context *adwc = context; if (dev->bus != &acpi_bus_type) return 0; return adwc->fn(to_acpi_device(dev), adwc->data); } EXPORT_SYMBOL_GPL(acpi_dev_for_each_child); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data) { struct acpi_dev_walk_context adwc = { .fn = fn, .data = data, }; return device_for_each_child(&adev->dev, &adwc, acpi_dev_for_one_check); } int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data) { struct acpi_dev_walk_context adwc = { .fn = fn, .data = data, }; return device_for_each_child_reverse(&adev->dev, &adwc, acpi_dev_for_one_check); } /* -------------------------------------------------------------------------- Initialization/Cleanup -------------------------------------------------------------------------- */ static int __init acpi_bus_init_irq(void) { acpi_status status; char *message = NULL; /* * Let the system know what interrupt model we are using by * evaluating the \_PIC object, if exists. 
*/ switch (acpi_irq_model) { case ACPI_IRQ_MODEL_PIC: message = "PIC"; break; case ACPI_IRQ_MODEL_IOAPIC: message = "IOAPIC"; break; case ACPI_IRQ_MODEL_IOSAPIC: message = "IOSAPIC"; break; case ACPI_IRQ_MODEL_GIC: message = "GIC"; break; case ACPI_IRQ_MODEL_PLATFORM: message = "platform specific model"; break; case ACPI_IRQ_MODEL_LPIC: message = "LPIC"; break; case ACPI_IRQ_MODEL_RINTC: message = "RINTC"; break; default: pr_info("Unknown interrupt routing model\n"); return -ENODEV; } pr_info("Using %s for interrupt routing\n", message); status = acpi_execute_simple_method(NULL, "\\_PIC", acpi_irq_model); if (ACPI_FAILURE(status) && (status != AE_NOT_FOUND)) { pr_info("_PIC evaluation failed: %s\n", acpi_format_exception(status)); return -ENODEV; } return 0; } /** * acpi_early_init - Initialize ACPICA and populate the ACPI namespace. * * The ACPI tables are accessible after this, but the handling of events has not * been initialized and the global lock is not available yet, so AML should not * be executed at this point. * * Doing this before switching the EFI runtime services to virtual mode allows * the EfiBootServices memory to be freed slightly earlier on boot. */ void __init acpi_early_init(void) { acpi_status status; if (acpi_disabled) return; pr_info("Core revision %08x\n", ACPI_CA_VERSION); /* enable workarounds, unless strict ACPI spec. compliance */ if (!acpi_strict) acpi_gbl_enable_interpreter_slack = TRUE; acpi_permanent_mmap = true; #ifdef CONFIG_X86 /* * If the machine falls into the DMI check table, * DSDT will be copied to memory. * Note that calling dmi_check_system() here on other architectures * would not be OK because only x86 initializes dmi early enough. * Thankfully only x86 systems need such quirks for now. */ dmi_check_system(dsdt_dmi_table); #endif status = acpi_reallocate_root_table(); if (ACPI_FAILURE(status)) { pr_err("Unable to reallocate ACPI tables\n"); goto error0; } status = acpi_initialize_subsystem(); if (ACPI_FAILURE(status)) { pr_err("Unable to initialize the ACPI Interpreter\n"); goto error0; } #ifdef CONFIG_X86 if (!acpi_ioapic) { /* compatible (0) means level (3) */ if (!(acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)) { acpi_sci_flags &= ~ACPI_MADT_TRIGGER_MASK; acpi_sci_flags |= ACPI_MADT_TRIGGER_LEVEL; } /* Set PIC-mode SCI trigger type */ acpi_pic_sci_set_trigger(acpi_gbl_FADT.sci_interrupt, (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2); } else { /* * now that acpi_gbl_FADT is initialized, * update it with result from INT_SRC_OVR parsing */ acpi_gbl_FADT.sci_interrupt = acpi_sci_override_gsi; } #endif return; error0: disable_acpi(); } /** * acpi_subsystem_init - Finalize the early initialization of ACPI. * * Switch over the platform to the ACPI mode (if possible). * * Doing this too early is generally unsafe, but at the same time it needs to be * done before all things that really depend on ACPI. The right spot appears to * be before finalizing the EFI initialization. */ void __init acpi_subsystem_init(void) { acpi_status status; if (acpi_disabled) return; status = acpi_enable_subsystem(~ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { pr_err("Unable to enable ACPI\n"); disable_acpi(); } else { /* * If the system is using ACPI then we can be reasonably * confident that any regulators are managed by the firmware * so tell the regulator core it has everything it needs to * know. 
*/ regulator_has_full_constraints(); } } static acpi_status acpi_bus_table_handler(u32 event, void *table, void *context) { if (event == ACPI_TABLE_EVENT_LOAD) acpi_scan_table_notify(); return acpi_sysfs_table_handler(event, table, context); } static int __init acpi_bus_init(void) { int result; acpi_status status; acpi_os_initialize1(); status = acpi_load_tables(); if (ACPI_FAILURE(status)) { pr_err("Unable to load the System Description Tables\n"); goto error1; } /* * ACPI 2.0 requires the EC driver to be loaded and work before the EC * device is found in the namespace. * * This is accomplished by looking for the ECDT table and getting the EC * parameters out of that. * * Do that before calling acpi_initialize_objects() which may trigger EC * address space accesses. */ acpi_ec_ecdt_probe(); status = acpi_enable_subsystem(ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { pr_err("Unable to start the ACPI Interpreter\n"); goto error1; } status = acpi_initialize_objects(ACPI_FULL_INITIALIZATION); if (ACPI_FAILURE(status)) { pr_err("Unable to initialize ACPI objects\n"); goto error1; } /* * _OSC method may exist in module level code, * so it must be run after ACPI_FULL_INITIALIZATION */ acpi_bus_osc_negotiate_platform_control(); acpi_bus_osc_negotiate_usb_control(); /* * _PDC control method may load dynamic SSDT tables, * and we need to install the table handler before that. */ status = acpi_install_table_handler(acpi_bus_table_handler, NULL); acpi_sysfs_init(); acpi_early_processor_control_setup(); /* * Maybe EC region is required at bus_scan/acpi_get_devices. So it * is necessary to enable it as early as possible. */ acpi_ec_dsdt_probe(); pr_info("Interpreter enabled\n"); /* Initialize sleep structures */ acpi_sleep_init(); /* * Get the system interrupt model and evaluate \_PIC. */ result = acpi_bus_init_irq(); if (result) goto error1; /* * Register for all standard device notifications. */ status = acpi_install_notify_handler(ACPI_ROOT_OBJECT, ACPI_SYSTEM_NOTIFY, &acpi_bus_notify, NULL); if (ACPI_FAILURE(status)) { pr_err("Unable to register for system notifications\n"); goto error1; } /* * Create the top ACPI proc directory */ acpi_root_dir = proc_mkdir(ACPI_BUS_FILE_ROOT, NULL); result = bus_register(&acpi_bus_type); if (!result) return 0; /* Mimic structured exception handling */ error1: acpi_terminate(); return -ENODEV; } struct kobject *acpi_kobj; EXPORT_SYMBOL_GPL(acpi_kobj); void __weak __init acpi_arch_init(void) { } static int __init acpi_init(void) { int result; if (acpi_disabled) { pr_info("Interpreter disabled.\n"); return -ENODEV; } acpi_kobj = kobject_create_and_add("acpi", firmware_kobj); if (!acpi_kobj) { pr_err("Failed to register kobject\n"); return -ENOMEM; } init_prmt(); acpi_init_pcc(); result = acpi_bus_init(); if (result) { kobject_put(acpi_kobj); disable_acpi(); return result; } acpi_init_ffh(); pci_mmcfg_late_init(); acpi_viot_early_init(); acpi_hest_init(); acpi_ghes_init(); acpi_arch_init(); acpi_scan_init(); acpi_ec_init(); acpi_debugfs_init(); acpi_sleep_proc_init(); acpi_wakeup_device_init(); acpi_debugger_init(); acpi_setup_sb_notify_handler(); acpi_viot_init(); return 0; } subsys_initcall(acpi_init); |
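/*
 * Illustrative sketch only (not part of bus.c): a minimal ACPI driver as it
 * would sit on top of the bus code above.  acpi_bus_match() compares the
 * device against .ids via acpi_match_device_ids(), and acpi_device_probe()
 * then invokes .ops.add().  The "TEST0001" _HID and all "example_*" names
 * below are hypothetical; callback signatures assume the current
 * struct acpi_driver layout.
 */
#include <linux/acpi.h>
#include <linux/module.h>

static const struct acpi_device_id example_acpi_ids[] = {
	{ "TEST0001", 0 },	/* hypothetical _HID */
	{ }
};
MODULE_DEVICE_TABLE(acpi, example_acpi_ids);

static int example_acpi_add(struct acpi_device *adev)
{
	dev_info(&adev->dev, "bound to %s\n", acpi_device_hid(adev));
	return 0;
}

static void example_acpi_remove(struct acpi_device *adev)
{
	/* undo whatever example_acpi_add() set up */
}

static struct acpi_driver example_acpi_driver = {
	.name = "example_acpi",
	.ids = example_acpi_ids,
	.ops = {
		.add = example_acpi_add,
		.remove = example_acpi_remove,
	},
};
module_acpi_driver(example_acpi_driver);
MODULE_LICENSE("GPL");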
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/affs/bitmap.c
 *
 * (c) 1996 Hans-Joachim Widmaier
 *
 * bitmap.c contains the code that handles all bitmap related stuff -
 * block allocation, deallocation, calculation of free space.
 */

#include <linux/slab.h>
#include "affs.h"

u32 affs_count_free_blocks(struct super_block *sb)
{
	struct affs_bm_info *bm;
	u32 free;
	int i;

	pr_debug("%s()\n", __func__);

	if (sb_rdonly(sb))
		return 0;

	mutex_lock(&AFFS_SB(sb)->s_bmlock);

	bm = AFFS_SB(sb)->s_bitmap;
	free = 0;
	for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--)
		free += bm->bm_free;

	mutex_unlock(&AFFS_SB(sb)->s_bmlock);

	return free;
}

void affs_free_block(struct super_block *sb, u32 block)
{
	struct affs_sb_info *sbi = AFFS_SB(sb);
	struct affs_bm_info *bm;
	struct buffer_head *bh;
	u32 blk, bmap, bit, mask, tmp;
	__be32 *data;

	pr_debug("%s(%u)\n", __func__, block);

	if (block > sbi->s_partition_size)
		goto err_range;

	blk = block - sbi->s_reserved;
	bmap = blk / sbi->s_bmap_bits;
	bit = blk % sbi->s_bmap_bits;
	bm = &sbi->s_bitmap[bmap];

	mutex_lock(&sbi->s_bmlock);

	bh = sbi->s_bmap_bh;
	if (sbi->s_last_bmap != bmap) {
		affs_brelse(bh);
		bh = affs_bread(sb, bm->bm_key);
		if (!bh)
			goto err_bh_read;
		sbi->s_bmap_bh = bh;
		sbi->s_last_bmap = bmap;
	}

	mask = 1 << (bit & 31);
	data = (__be32 *)bh->b_data + bit / 32 + 1;

	/* mark block free */
	tmp = be32_to_cpu(*data);
	if (tmp & mask)
		goto err_free;
	*data = cpu_to_be32(tmp | mask);

	/* fix checksum */
	tmp = be32_to_cpu(*(__be32 *)bh->b_data);
	*(__be32 *)bh->b_data = cpu_to_be32(tmp - mask);

	mark_buffer_dirty(bh);
	affs_mark_sb_dirty(sb);
	bm->bm_free++;

	mutex_unlock(&sbi->s_bmlock);
	return;

err_free:
	affs_warning(sb, "affs_free_block", "Trying to free block %u which is already free", block);
	mutex_unlock(&sbi->s_bmlock);
	return;

err_bh_read:
	affs_error(sb, "affs_free_block", "Cannot read bitmap block %u", bm->bm_key);
	sbi->s_bmap_bh = NULL;
	sbi->s_last_bmap = ~0;
	mutex_unlock(&sbi->s_bmlock);
	return;

err_range:
	affs_error(sb, "affs_free_block", "Block %u outside partition", block);
}

/*
 * Allocate a block in the given allocation zone.
 * Since we have to byte-swap the bitmap on little-endian
 * machines, this is rather expensive.
Therefore we will * preallocate up to 16 blocks from the same word, if * possible. We are not doing preallocations in the * header zone, though. */ u32 affs_alloc_block(struct inode *inode, u32 goal) { struct super_block *sb; struct affs_sb_info *sbi; struct affs_bm_info *bm; struct buffer_head *bh; __be32 *data, *enddata; u32 blk, bmap, bit, mask, mask2, tmp; int i; sb = inode->i_sb; sbi = AFFS_SB(sb); pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); if (AFFS_I(inode)->i_pa_cnt) { pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1); AFFS_I(inode)->i_pa_cnt--; return ++AFFS_I(inode)->i_lastalloc; } if (!goal || goal > sbi->s_partition_size) { if (goal) affs_warning(sb, "affs_balloc", "invalid goal %d", goal); //if (!AFFS_I(inode)->i_last_block) // affs_warning(sb, "affs_balloc", "no last alloc block"); goal = sbi->s_reserved; } blk = goal - sbi->s_reserved; bmap = blk / sbi->s_bmap_bits; bm = &sbi->s_bitmap[bmap]; mutex_lock(&sbi->s_bmlock); if (bm->bm_free) goto find_bmap_bit; find_bmap: /* search for the next bmap buffer with free bits */ i = sbi->s_bmap_count; do { if (--i < 0) goto err_full; bmap++; bm++; if (bmap < sbi->s_bmap_count) continue; /* restart search at zero */ bmap = 0; bm = sbi->s_bitmap; } while (!bm->bm_free); blk = bmap * sbi->s_bmap_bits; find_bmap_bit: bh = sbi->s_bmap_bh; if (sbi->s_last_bmap != bmap) { affs_brelse(bh); bh = affs_bread(sb, bm->bm_key); if (!bh) goto err_bh_read; sbi->s_bmap_bh = bh; sbi->s_last_bmap = bmap; } /* find an unused block in this bitmap block */ bit = blk % sbi->s_bmap_bits; data = (__be32 *)bh->b_data + bit / 32 + 1; enddata = (__be32 *)((u8 *)bh->b_data + sb->s_blocksize); mask = ~0UL << (bit & 31); blk &= ~31UL; tmp = be32_to_cpu(*data); if (tmp & mask) goto find_bit; /* scan the rest of the buffer */ do { blk += 32; if (++data >= enddata) /* didn't find something, can only happen * if scan didn't start at 0, try next bmap */ goto find_bmap; } while (!*data); tmp = be32_to_cpu(*data); mask = ~0; find_bit: /* finally look for a free bit in the word */ bit = ffs(tmp & mask) - 1; blk += bit + sbi->s_reserved; mask2 = mask = 1 << (bit & 31); AFFS_I(inode)->i_lastalloc = blk; /* prealloc as much as possible within this word */ while ((mask2 <<= 1)) { if (!(tmp & mask2)) break; AFFS_I(inode)->i_pa_cnt++; mask |= mask2; } bm->bm_free -= AFFS_I(inode)->i_pa_cnt + 1; *data = cpu_to_be32(tmp & ~mask); /* fix checksum */ tmp = be32_to_cpu(*(__be32 *)bh->b_data); *(__be32 *)bh->b_data = cpu_to_be32(tmp + mask); mark_buffer_dirty(bh); affs_mark_sb_dirty(sb); mutex_unlock(&sbi->s_bmlock); pr_debug("%d\n", blk); return blk; err_bh_read: affs_error(sb,"affs_read_block","Cannot read bitmap block %u", bm->bm_key); sbi->s_bmap_bh = NULL; sbi->s_last_bmap = ~0; err_full: mutex_unlock(&sbi->s_bmlock); pr_debug("failed\n"); return 0; } int affs_init_bitmap(struct super_block *sb, int *flags) { struct affs_bm_info *bm; struct buffer_head *bmap_bh = NULL, *bh = NULL; __be32 *bmap_blk; u32 size, blk, end, offset, mask; int i, res = 0; struct affs_sb_info *sbi = AFFS_SB(sb); if (*flags & SB_RDONLY) return 0; if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id); *flags |= SB_RDONLY; return 0; } sbi->s_last_bmap = ~0; sbi->s_bmap_bh = NULL; sbi->s_bmap_bits = sb->s_blocksize * 8 - 32; sbi->s_bmap_count = (sbi->s_partition_size - sbi->s_reserved + sbi->s_bmap_bits - 1) / sbi->s_bmap_bits; size = sbi->s_bmap_count * sizeof(*bm); bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL); if (!sbi->s_bitmap) 
{ pr_err("Bitmap allocation failed\n"); return -ENOMEM; } bmap_blk = (__be32 *)sbi->s_root_bh->b_data; blk = sb->s_blocksize / 4 - 49; end = blk + 25; for (i = sbi->s_bmap_count; i > 0; bm++, i--) { affs_brelse(bh); bm->bm_key = be32_to_cpu(bmap_blk[blk]); bh = affs_bread(sb, bm->bm_key); if (!bh) { pr_err("Cannot read bitmap\n"); res = -EIO; goto out; } if (affs_checksum_block(sb, bh)) { pr_warn("Bitmap %u invalid - mounting %s read only.\n", bm->bm_key, sb->s_id); *flags |= SB_RDONLY; goto out; } pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key); bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); /* Don't try read the extension if this is the last block, * but we also need the right bm pointer below */ if (++blk < end || i == 1) continue; if (bmap_bh) affs_brelse(bmap_bh); bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk])); if (!bmap_bh) { pr_err("Cannot read bitmap extension\n"); res = -EIO; goto out; } bmap_blk = (__be32 *)bmap_bh->b_data; blk = 0; end = sb->s_blocksize / 4 - 1; } offset = (sbi->s_partition_size - sbi->s_reserved) % sbi->s_bmap_bits; mask = ~(0xFFFFFFFFU << (offset & 31)); pr_debug("last word: %d %d %d\n", offset, offset / 32 + 1, mask); offset = offset / 32 + 1; if (mask) { u32 old, new; /* Mark unused bits in the last word as allocated */ old = be32_to_cpu(((__be32 *)bh->b_data)[offset]); new = old & mask; //if (old != new) { ((__be32 *)bh->b_data)[offset] = cpu_to_be32(new); /* fix checksum */ //new -= old; //old = be32_to_cpu(*(__be32 *)bh->b_data); //*(__be32 *)bh->b_data = cpu_to_be32(old - new); //mark_buffer_dirty(bh); //} /* correct offset for the bitmap count below */ //offset++; } while (++offset < sb->s_blocksize / 4) ((__be32 *)bh->b_data)[offset] = 0; ((__be32 *)bh->b_data)[0] = 0; ((__be32 *)bh->b_data)[0] = cpu_to_be32(-affs_checksum_block(sb, bh)); mark_buffer_dirty(bh); /* recalculate bitmap count for last block */ bm--; bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); out: affs_brelse(bh); affs_brelse(bmap_bh); return res; } void affs_free_bitmap(struct super_block *sb) { struct affs_sb_info *sbi = AFFS_SB(sb); if (!sbi->s_bitmap) return; affs_brelse(sbi->s_bmap_bh); sbi->s_bmap_bh = NULL; sbi->s_last_bmap = ~0; kfree(sbi->s_bitmap); sbi->s_bitmap = NULL; } |
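/*
 * Illustrative sketch only (not part of bitmap.c): how a disk block number
 * maps onto the AFFS bitmap, mirroring the arithmetic used by
 * affs_free_block() and affs_alloc_block() above.  Word 0 of each bitmap
 * block holds the checksum, so data words start at index 1, and a set bit
 * means "block is free".  The helper name is made up.
 */
static void example_affs_bitmap_pos(u32 block, u32 reserved, u32 bmap_bits,
				    u32 *bmap, u32 *word, u32 *mask)
{
	u32 blk = block - reserved;	/* blocks below s_reserved are never mapped */

	*bmap = blk / bmap_bits;		/* which bitmap block */
	*word = (blk % bmap_bits) / 32 + 1;	/* word inside it (word 0 is the checksum) */
	*mask = 1 << ((blk % bmap_bits) & 31);	/* bit inside that word */
}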
// SPDX-License-Identifier: GPL-2.0
/* dvb-usb-urb.c is part of the DVB USB library.
 *
 * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de)
 * see dvb-usb-init.c for copyright information.
 *
 * This file keeps functions for initializing and handling the
 * USB and URB stuff.
 */
#include "dvb-usb-common.h"

int dvb_usb_generic_rw(struct dvb_usb_device *d, u8 *wbuf, u16 wlen, u8 *rbuf,
	u16 rlen, int delay_ms)
{
	int actlen = 0, ret = -ENOMEM;

	if (!d || wbuf == NULL || wlen == 0)
		return -EINVAL;

	if (d->props.generic_bulk_ctrl_endpoint == 0) {
		err("endpoint for generic control not specified.");
		return -EINVAL;
	}

	if ((ret = mutex_lock_interruptible(&d->usb_mutex)))
		return ret;

	deb_xfer(">>> ");
	debug_dump(wbuf,wlen,deb_xfer);

	ret = usb_bulk_msg(d->udev,usb_sndbulkpipe(d->udev,
			d->props.generic_bulk_ctrl_endpoint), wbuf,wlen,&actlen,
			2000);

	if (ret)
		err("bulk message failed: %d (%d/%d)",ret,wlen,actlen);
	else
		ret = actlen != wlen ? -1 : 0;

	/* an answer is expected, and no error before */
	if (!ret && rbuf && rlen) {
		if (delay_ms)
			msleep(delay_ms);

		ret = usb_bulk_msg(d->udev,usb_rcvbulkpipe(d->udev,
				d->props.generic_bulk_ctrl_endpoint_response ?
				d->props.generic_bulk_ctrl_endpoint_response :
				d->props.generic_bulk_ctrl_endpoint),rbuf,rlen,&actlen,
				2000);

		if (ret)
			err("recv bulk message failed: %d",ret);
		else {
			deb_xfer("<<< ");
			debug_dump(rbuf,actlen,deb_xfer);
		}
	}

	mutex_unlock(&d->usb_mutex);
	return ret;
}
EXPORT_SYMBOL(dvb_usb_generic_rw);

int dvb_usb_generic_write(struct dvb_usb_device *d, u8 *buf, u16 len)
{
	return dvb_usb_generic_rw(d,buf,len,NULL,0,0);
}
EXPORT_SYMBOL(dvb_usb_generic_write);

static void dvb_usb_data_complete(struct usb_data_stream *stream, u8 *buffer,
		size_t length)
{
	struct dvb_usb_adapter *adap = stream->user_priv;

	if (adap->feedcount > 0 && adap->state & DVB_USB_ADAP_STATE_DVB)
		dvb_dmx_swfilter(&adap->demux, buffer, length);
}

static void dvb_usb_data_complete_204(struct usb_data_stream *stream,
		u8 *buffer, size_t length)
{
	struct dvb_usb_adapter *adap = stream->user_priv;

	if (adap->feedcount > 0 && adap->state & DVB_USB_ADAP_STATE_DVB)
		dvb_dmx_swfilter_204(&adap->demux, buffer, length);
}

static void dvb_usb_data_complete_raw(struct usb_data_stream *stream,
		u8 *buffer, size_t length)
{
	struct dvb_usb_adapter *adap = stream->user_priv;

	if (adap->feedcount > 0 && adap->state & DVB_USB_ADAP_STATE_DVB)
		dvb_dmx_swfilter_raw(&adap->demux, buffer, length);
}

int dvb_usb_adapter_stream_init(struct dvb_usb_adapter *adap)
{
	int i, ret = 0;

	for (i = 0; i < adap->props.num_frontends; i++) {
		adap->fe_adap[i].stream.udev = adap->dev->udev;
		if (adap->props.fe[i].caps & DVB_USB_ADAP_RECEIVES_204_BYTE_TS)
			adap->fe_adap[i].stream.complete =
				dvb_usb_data_complete_204;
		else if (adap->props.fe[i].caps & DVB_USB_ADAP_RECEIVES_RAW_PAYLOAD)
			adap->fe_adap[i].stream.complete =
				dvb_usb_data_complete_raw;
		else
			adap->fe_adap[i].stream.complete = dvb_usb_data_complete;
		adap->fe_adap[i].stream.user_priv = adap;
		ret = usb_urb_init(&adap->fe_adap[i].stream,
				   &adap->props.fe[i].stream);
		if (ret < 0)
			break;
	}
	return ret;
}
int dvb_usb_adapter_stream_exit(struct dvb_usb_adapter *adap)
{
	int i;

	for (i = 0; i < adap->props.num_frontends; i++)
		usb_urb_exit(&adap->fe_adap[i].stream);

	return 0;
}
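/*
 * Illustrative sketch only (not part of dvb-usb-urb.c): how a device driver
 * typically wraps dvb_usb_generic_rw() for its vendor command protocol.
 * The 0x01 opcode and the 6-byte reply are made up for the example.
 */
static int example_read_mac(struct dvb_usb_device *d, u8 mac[6])
{
	u8 cmd = 0x01;	/* hypothetical "read MAC address" opcode */

	/* one bulk write of the command, then one bulk read of the reply */
	return dvb_usb_generic_rw(d, &cmd, 1, mac, 6, 0);
}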
/*
 * Copyright (c) 2004-2011 Atheros Communications Inc.
 * Copyright (c) 2011-2012 Qualcomm Atheros, Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/ #include "core.h" #include "hif-ops.h" #include "target.h" #include "debug.h" int ath6kl_bmi_done(struct ath6kl *ar) { int ret; u32 cid = BMI_DONE; if (ar->bmi.done_sent) { ath6kl_dbg(ATH6KL_DBG_BMI, "bmi done skipped\n"); return 0; } ar->bmi.done_sent = true; ret = ath6kl_hif_bmi_write(ar, (u8 *)&cid, sizeof(cid)); if (ret) { ath6kl_err("Unable to send bmi done: %d\n", ret); return ret; } return 0; } int ath6kl_bmi_get_target_info(struct ath6kl *ar, struct ath6kl_bmi_target_info *targ_info) { int ret; u32 cid = BMI_GET_TARGET_INFO; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } ret = ath6kl_hif_bmi_write(ar, (u8 *)&cid, sizeof(cid)); if (ret) { ath6kl_err("Unable to send get target info: %d\n", ret); return ret; } if (ar->hif_type == ATH6KL_HIF_TYPE_USB) { ret = ath6kl_hif_bmi_read(ar, (u8 *)targ_info, sizeof(*targ_info)); } else { ret = ath6kl_hif_bmi_read(ar, (u8 *)&targ_info->version, sizeof(targ_info->version)); } if (ret) { ath6kl_err("Unable to recv target info: %d\n", ret); return ret; } if (le32_to_cpu(targ_info->version) == TARGET_VERSION_SENTINAL) { /* Determine how many bytes are in the Target's targ_info */ ret = ath6kl_hif_bmi_read(ar, (u8 *)&targ_info->byte_count, sizeof(targ_info->byte_count)); if (ret) { ath6kl_err("unable to read target info byte count: %d\n", ret); return ret; } /* * The target's targ_info doesn't match the host's targ_info. * We need to do some backwards compatibility to make this work. */ if (le32_to_cpu(targ_info->byte_count) != sizeof(*targ_info)) { ath6kl_err("mismatched byte count %d vs. expected %zd\n", le32_to_cpu(targ_info->byte_count), sizeof(*targ_info)); return -EINVAL; } /* Read the remainder of the targ_info */ ret = ath6kl_hif_bmi_read(ar, ((u8 *)targ_info) + sizeof(targ_info->byte_count), sizeof(*targ_info) - sizeof(targ_info->byte_count)); if (ret) { ath6kl_err("Unable to read target info (%d bytes): %d\n", targ_info->byte_count, ret); return ret; } } ath6kl_dbg(ATH6KL_DBG_BMI, "target info (ver: 0x%x type: 0x%x)\n", targ_info->version, targ_info->type); return 0; } int ath6kl_bmi_read(struct ath6kl *ar, u32 addr, u8 *buf, u32 len) { u32 cid = BMI_READ_MEMORY; int ret; u32 offset; u32 len_remain, rx_len; u16 size; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } size = ar->bmi.max_data_size + sizeof(cid) + sizeof(addr) + sizeof(len); if (size > ar->bmi.max_cmd_size) { WARN_ON(1); return -EINVAL; } memset(ar->bmi.cmd_buf, 0, size); ath6kl_dbg(ATH6KL_DBG_BMI, "bmi read memory: device: addr: 0x%x, len: %d\n", addr, len); len_remain = len; while (len_remain) { rx_len = (len_remain < ar->bmi.max_data_size) ? 
len_remain : ar->bmi.max_data_size; offset = 0; memcpy(&(ar->bmi.cmd_buf[offset]), &cid, sizeof(cid)); offset += sizeof(cid); memcpy(&(ar->bmi.cmd_buf[offset]), &addr, sizeof(addr)); offset += sizeof(addr); memcpy(&(ar->bmi.cmd_buf[offset]), &rx_len, sizeof(rx_len)); offset += sizeof(len); ret = ath6kl_hif_bmi_write(ar, ar->bmi.cmd_buf, offset); if (ret) { ath6kl_err("Unable to write to the device: %d\n", ret); return ret; } ret = ath6kl_hif_bmi_read(ar, ar->bmi.cmd_buf, rx_len); if (ret) { ath6kl_err("Unable to read from the device: %d\n", ret); return ret; } memcpy(&buf[len - len_remain], ar->bmi.cmd_buf, rx_len); len_remain -= rx_len; addr += rx_len; } return 0; } int ath6kl_bmi_write(struct ath6kl *ar, u32 addr, u8 *buf, u32 len) { u32 cid = BMI_WRITE_MEMORY; int ret; u32 offset; u32 len_remain, tx_len; const u32 header = sizeof(cid) + sizeof(addr) + sizeof(len); u8 aligned_buf[400]; u8 *src; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } if ((ar->bmi.max_data_size + header) > ar->bmi.max_cmd_size) { WARN_ON(1); return -EINVAL; } if (WARN_ON(ar->bmi.max_data_size > sizeof(aligned_buf))) return -E2BIG; memset(ar->bmi.cmd_buf, 0, ar->bmi.max_data_size + header); ath6kl_dbg(ATH6KL_DBG_BMI, "bmi write memory: addr: 0x%x, len: %d\n", addr, len); len_remain = len; while (len_remain) { src = &buf[len - len_remain]; if (len_remain < (ar->bmi.max_data_size - header)) { if (len_remain & 3) { /* align it with 4 bytes */ len_remain = len_remain + (4 - (len_remain & 3)); memcpy(aligned_buf, src, len_remain); src = aligned_buf; } tx_len = len_remain; } else { tx_len = (ar->bmi.max_data_size - header); } offset = 0; memcpy(&(ar->bmi.cmd_buf[offset]), &cid, sizeof(cid)); offset += sizeof(cid); memcpy(&(ar->bmi.cmd_buf[offset]), &addr, sizeof(addr)); offset += sizeof(addr); memcpy(&(ar->bmi.cmd_buf[offset]), &tx_len, sizeof(tx_len)); offset += sizeof(tx_len); memcpy(&(ar->bmi.cmd_buf[offset]), src, tx_len); offset += tx_len; ret = ath6kl_hif_bmi_write(ar, ar->bmi.cmd_buf, offset); if (ret) { ath6kl_err("Unable to write to the device: %d\n", ret); return ret; } len_remain -= tx_len; addr += tx_len; } return 0; } int ath6kl_bmi_execute(struct ath6kl *ar, u32 addr, u32 *param) { u32 cid = BMI_EXECUTE; int ret; u32 offset; u16 size; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } size = sizeof(cid) + sizeof(addr) + sizeof(*param); if (size > ar->bmi.max_cmd_size) { WARN_ON(1); return -EINVAL; } memset(ar->bmi.cmd_buf, 0, size); ath6kl_dbg(ATH6KL_DBG_BMI, "bmi execute: addr: 0x%x, param: %d)\n", addr, *param); offset = 0; memcpy(&(ar->bmi.cmd_buf[offset]), &cid, sizeof(cid)); offset += sizeof(cid); memcpy(&(ar->bmi.cmd_buf[offset]), &addr, sizeof(addr)); offset += sizeof(addr); memcpy(&(ar->bmi.cmd_buf[offset]), param, sizeof(*param)); offset += sizeof(*param); ret = ath6kl_hif_bmi_write(ar, ar->bmi.cmd_buf, offset); if (ret) { ath6kl_err("Unable to write to the device: %d\n", ret); return ret; } ret = ath6kl_hif_bmi_read(ar, ar->bmi.cmd_buf, sizeof(*param)); if (ret) { ath6kl_err("Unable to read from the device: %d\n", ret); return ret; } memcpy(param, ar->bmi.cmd_buf, sizeof(*param)); return 0; } int ath6kl_bmi_set_app_start(struct ath6kl *ar, u32 addr) { u32 cid = BMI_SET_APP_START; int ret; u32 offset; u16 size; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } size = sizeof(cid) + sizeof(addr); if (size > ar->bmi.max_cmd_size) { 
WARN_ON(1); return -EINVAL; } memset(ar->bmi.cmd_buf, 0, size); ath6kl_dbg(ATH6KL_DBG_BMI, "bmi set app start: addr: 0x%x\n", addr); offset = 0; memcpy(&(ar->bmi.cmd_buf[offset]), &cid, sizeof(cid)); offset += sizeof(cid); memcpy(&(ar->bmi.cmd_buf[offset]), &addr, sizeof(addr)); offset += sizeof(addr); ret = ath6kl_hif_bmi_write(ar, ar->bmi.cmd_buf, offset); if (ret) { ath6kl_err("Unable to write to the device: %d\n", ret); return ret; } return 0; } int ath6kl_bmi_reg_read(struct ath6kl *ar, u32 addr, u32 *param) { u32 cid = BMI_READ_SOC_REGISTER; int ret; u32 offset; u16 size; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } size = sizeof(cid) + sizeof(addr); if (size > ar->bmi.max_cmd_size) { WARN_ON(1); return -EINVAL; } memset(ar->bmi.cmd_buf, 0, size); ath6kl_dbg(ATH6KL_DBG_BMI, "bmi read SOC reg: addr: 0x%x\n", addr); offset = 0; memcpy(&(ar->bmi.cmd_buf[offset]), &cid, sizeof(cid)); offset += sizeof(cid); memcpy(&(ar->bmi.cmd_buf[offset]), &addr, sizeof(addr)); offset += sizeof(addr); ret = ath6kl_hif_bmi_write(ar, ar->bmi.cmd_buf, offset); if (ret) { ath6kl_err("Unable to write to the device: %d\n", ret); return ret; } ret = ath6kl_hif_bmi_read(ar, ar->bmi.cmd_buf, sizeof(*param)); if (ret) { ath6kl_err("Unable to read from the device: %d\n", ret); return ret; } memcpy(param, ar->bmi.cmd_buf, sizeof(*param)); return 0; } int ath6kl_bmi_reg_write(struct ath6kl *ar, u32 addr, u32 param) { u32 cid = BMI_WRITE_SOC_REGISTER; int ret; u32 offset; u16 size; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } size = sizeof(cid) + sizeof(addr) + sizeof(param); if (size > ar->bmi.max_cmd_size) { WARN_ON(1); return -EINVAL; } memset(ar->bmi.cmd_buf, 0, size); ath6kl_dbg(ATH6KL_DBG_BMI, "bmi write SOC reg: addr: 0x%x, param: %d\n", addr, param); offset = 0; memcpy(&(ar->bmi.cmd_buf[offset]), &cid, sizeof(cid)); offset += sizeof(cid); memcpy(&(ar->bmi.cmd_buf[offset]), &addr, sizeof(addr)); offset += sizeof(addr); memcpy(&(ar->bmi.cmd_buf[offset]), ¶m, sizeof(param)); offset += sizeof(param); ret = ath6kl_hif_bmi_write(ar, ar->bmi.cmd_buf, offset); if (ret) { ath6kl_err("Unable to write to the device: %d\n", ret); return ret; } return 0; } int ath6kl_bmi_lz_data(struct ath6kl *ar, u8 *buf, u32 len) { u32 cid = BMI_LZ_DATA; int ret; u32 offset; u32 len_remain, tx_len; const u32 header = sizeof(cid) + sizeof(len); u16 size; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } size = ar->bmi.max_data_size + header; if (size > ar->bmi.max_cmd_size) { WARN_ON(1); return -EINVAL; } memset(ar->bmi.cmd_buf, 0, size); ath6kl_dbg(ATH6KL_DBG_BMI, "bmi send LZ data: len: %d)\n", len); len_remain = len; while (len_remain) { tx_len = (len_remain < (ar->bmi.max_data_size - header)) ? 
len_remain : (ar->bmi.max_data_size - header); offset = 0; memcpy(&(ar->bmi.cmd_buf[offset]), &cid, sizeof(cid)); offset += sizeof(cid); memcpy(&(ar->bmi.cmd_buf[offset]), &tx_len, sizeof(tx_len)); offset += sizeof(tx_len); memcpy(&(ar->bmi.cmd_buf[offset]), &buf[len - len_remain], tx_len); offset += tx_len; ret = ath6kl_hif_bmi_write(ar, ar->bmi.cmd_buf, offset); if (ret) { ath6kl_err("Unable to write to the device: %d\n", ret); return ret; } len_remain -= tx_len; } return 0; } int ath6kl_bmi_lz_stream_start(struct ath6kl *ar, u32 addr) { u32 cid = BMI_LZ_STREAM_START; int ret; u32 offset; u16 size; if (ar->bmi.done_sent) { ath6kl_err("bmi done sent already, cmd %d disallowed\n", cid); return -EACCES; } size = sizeof(cid) + sizeof(addr); if (size > ar->bmi.max_cmd_size) { WARN_ON(1); return -EINVAL; } memset(ar->bmi.cmd_buf, 0, size); ath6kl_dbg(ATH6KL_DBG_BMI, "bmi LZ stream start: addr: 0x%x)\n", addr); offset = 0; memcpy(&(ar->bmi.cmd_buf[offset]), &cid, sizeof(cid)); offset += sizeof(cid); memcpy(&(ar->bmi.cmd_buf[offset]), &addr, sizeof(addr)); offset += sizeof(addr); ret = ath6kl_hif_bmi_write(ar, ar->bmi.cmd_buf, offset); if (ret) { ath6kl_err("Unable to start LZ stream to the device: %d\n", ret); return ret; } return 0; } int ath6kl_bmi_fast_download(struct ath6kl *ar, u32 addr, u8 *buf, u32 len) { int ret; u32 last_word = 0; u32 last_word_offset = len & ~0x3; u32 unaligned_bytes = len & 0x3; ret = ath6kl_bmi_lz_stream_start(ar, addr); if (ret) return ret; if (unaligned_bytes) { /* copy the last word into a zero padded buffer */ memcpy(&last_word, &buf[last_word_offset], unaligned_bytes); } ret = ath6kl_bmi_lz_data(ar, buf, last_word_offset); if (ret) return ret; if (unaligned_bytes) ret = ath6kl_bmi_lz_data(ar, (u8 *)&last_word, 4); if (!ret) { /* Close compressed stream and open a new (fake) one. * This serves mainly to flush Target caches. */ ret = ath6kl_bmi_lz_stream_start(ar, 0x00); } return ret; } void ath6kl_bmi_reset(struct ath6kl *ar) { ar->bmi.done_sent = false; } int ath6kl_bmi_init(struct ath6kl *ar) { if (WARN_ON(ar->bmi.max_data_size == 0)) return -EINVAL; /* cmd + addr + len + data_size */ ar->bmi.max_cmd_size = ar->bmi.max_data_size + (sizeof(u32) * 3); ar->bmi.cmd_buf = kzalloc(ar->bmi.max_cmd_size, GFP_KERNEL); if (!ar->bmi.cmd_buf) return -ENOMEM; return 0; } void ath6kl_bmi_cleanup(struct ath6kl *ar) { kfree(ar->bmi.cmd_buf); ar->bmi.cmd_buf = NULL; } |
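/*
 * Illustrative sketch only (not part of the driver): the order in which the
 * BMI helpers above are typically used to bootstrap a firmware image.  The
 * helper name ath6kl_bmi_example_boot() and its parameters are hypothetical;
 * the real download path lives elsewhere in the driver and also handles
 * board data, patches and address fixups.
 */
static int __maybe_unused ath6kl_bmi_example_boot(struct ath6kl *ar,
						  u32 load_addr,
						  u8 *fw, u32 fw_len)
{
	struct ath6kl_bmi_target_info targ_info = {};
	int ret;

	/* Target info must be fetched before "bmi done" is sent. */
	ret = ath6kl_bmi_get_target_info(ar, &targ_info);
	if (ret)
		return ret;

	/* Copy the image into target memory in max_data_size chunks. */
	ret = ath6kl_bmi_write(ar, load_addr, fw, fw_len);
	if (ret)
		return ret;

	/* Record the entry point ... */
	ret = ath6kl_bmi_set_app_start(ar, load_addr);
	if (ret)
		return ret;

	/* ... and leave BMI mode; further BMI commands return -EACCES. */
	return ath6kl_bmi_done(ar);
}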
| 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2017 Netronome Systems, Inc. * Copyright (C) 2019 Mellanox Technologies. All rights reserved */ #include <linux/completion.h> #include <linux/device.h> #include <linux/idr.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/sysfs.h> #include "netdevsim.h" static DEFINE_IDA(nsim_bus_dev_ids); static LIST_HEAD(nsim_bus_dev_list); static DEFINE_MUTEX(nsim_bus_dev_list_lock); static bool nsim_bus_enable; static refcount_t nsim_bus_devs; /* Including the bus itself. */ static DECLARE_COMPLETION(nsim_bus_devs_released); static struct nsim_bus_dev *to_nsim_bus_dev(struct device *dev) { return container_of(dev, struct nsim_bus_dev, dev); } static ssize_t nsim_bus_dev_numvfs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int num_vfs; int ret; ret = kstrtouint(buf, 0, &num_vfs); if (ret) return ret; device_lock(dev); ret = -ENOENT; if (dev_get_drvdata(dev)) ret = nsim_drv_configure_vfs(nsim_bus_dev, num_vfs); device_unlock(dev); return ret ? 
ret : count; } static ssize_t nsim_bus_dev_numvfs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return sprintf(buf, "%u\n", nsim_bus_dev->num_vfs); } static struct device_attribute nsim_bus_dev_numvfs_attr = __ATTR(sriov_numvfs, 0664, nsim_bus_dev_numvfs_show, nsim_bus_dev_numvfs_store); static ssize_t new_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); u8 eth_addr[ETH_ALEN] = {}; unsigned int port_index; bool addr_set = false; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = sscanf(buf, "%u %hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &port_index, ð_addr[0], ð_addr[1], ð_addr[2], ð_addr[3], ð_addr[4], ð_addr[5]); switch (ret) { case 7: if (!is_valid_ether_addr(eth_addr)) { pr_err("The supplied perm_addr is not a valid MAC address\n"); return -EINVAL; } addr_set = true; fallthrough; case 1: break; default: pr_err("Format for adding new port is \"id [perm_addr]\" (uint MAC).\n"); return -EINVAL; } ret = nsim_drv_port_add(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index, addr_set ? eth_addr : NULL); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_new_port_attr = __ATTR_WO(new_port); static ssize_t del_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int port_index; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = kstrtouint(buf, 0, &port_index); if (ret) return ret; ret = nsim_drv_port_del(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_del_port_attr = __ATTR_WO(del_port); static struct attribute *nsim_bus_dev_attrs[] = { &nsim_bus_dev_numvfs_attr.attr, &nsim_bus_dev_new_port_attr.attr, &nsim_bus_dev_del_port_attr.attr, NULL, }; static const struct attribute_group nsim_bus_dev_attr_group = { .attrs = nsim_bus_dev_attrs, }; static const struct attribute_group *nsim_bus_dev_attr_groups[] = { &nsim_bus_dev_attr_group, NULL, }; static void nsim_bus_dev_release(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev; nsim_bus_dev = container_of(dev, struct nsim_bus_dev, dev); kfree(nsim_bus_dev); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); } static const struct device_type nsim_bus_dev_type = { .groups = nsim_bus_dev_attr_groups, .release = nsim_bus_dev_release, }; static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues); static ssize_t new_device_store(const struct bus_type *bus, const char *buf, size_t count) { unsigned int id, port_count, num_queues; struct nsim_bus_dev *nsim_bus_dev; int err; err = sscanf(buf, "%u %u %u", &id, &port_count, &num_queues); switch (err) { case 1: port_count = 1; fallthrough; case 2: num_queues = 1; fallthrough; case 3: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for adding new device is \"id port_count num_queues\" (uint uint unit).\n"); return -EINVAL; } mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. 
*/ if (!smp_load_acquire(&nsim_bus_enable)) { err = -EBUSY; goto err; } nsim_bus_dev = nsim_bus_dev_new(id, port_count, num_queues); if (IS_ERR(nsim_bus_dev)) { err = PTR_ERR(nsim_bus_dev); goto err; } refcount_inc(&nsim_bus_devs); /* Allow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, true); list_add_tail(&nsim_bus_dev->list, &nsim_bus_dev_list); mutex_unlock(&nsim_bus_dev_list_lock); return count; err: mutex_unlock(&nsim_bus_dev_list_lock); return err; } static BUS_ATTR_WO(new_device); static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev); static ssize_t del_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev, *tmp; unsigned int id; int err; err = sscanf(buf, "%u", &id); switch (err) { case 1: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for deleting device is \"id\" (uint).\n"); return -EINVAL; } err = -ENOENT; mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { mutex_unlock(&nsim_bus_dev_list_lock); return -EBUSY; } list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { if (nsim_bus_dev->dev.id != id) continue; list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); err = 0; break; } mutex_unlock(&nsim_bus_dev_list_lock); return !err ? count : err; } static BUS_ATTR_WO(del_device); static ssize_t link_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim_a, *nsim_b, *peer; struct net_device *dev_a, *dev_b; unsigned int ifidx_a, ifidx_b; int netnsfd_a, netnsfd_b, err; struct net *ns_a, *ns_b; err = sscanf(buf, "%d:%u %d:%u", &netnsfd_a, &ifidx_a, &netnsfd_b, &ifidx_b); if (err != 4) { pr_err("Format for linking two devices is \"netnsfd_a:ifidx_a netnsfd_b:ifidx_b\" (int uint int uint).\n"); return -EINVAL; } ns_a = get_net_ns_by_fd(netnsfd_a); if (IS_ERR(ns_a)) { pr_err("Could not find netns with fd: %d\n", netnsfd_a); return -EINVAL; } ns_b = get_net_ns_by_fd(netnsfd_b); if (IS_ERR(ns_b)) { pr_err("Could not find netns with fd: %d\n", netnsfd_b); put_net(ns_a); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev_a = __dev_get_by_index(ns_a, ifidx_a); if (!dev_a) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_a, netnsfd_a); goto out_err; } if (!netdev_is_nsim(dev_a)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_a, netnsfd_a); goto out_err; } dev_b = __dev_get_by_index(ns_b, ifidx_b); if (!dev_b) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_b, netnsfd_b); goto out_err; } if (!netdev_is_nsim(dev_b)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_b, netnsfd_b); goto out_err; } if (dev_a == dev_b) { pr_err("Cannot link a netdevsim to itself\n"); goto out_err; } err = -EBUSY; nsim_a = netdev_priv(dev_a); peer = rtnl_dereference(nsim_a->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_a, ifidx_a); goto out_err; } nsim_b = netdev_priv(dev_b); peer = rtnl_dereference(nsim_b->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_b, ifidx_b); goto out_err; } err = 0; rcu_assign_pointer(nsim_a->peer, nsim_b); rcu_assign_pointer(nsim_b->peer, nsim_a); if (netif_running(dev_a) && netif_running(dev_b)) { netif_carrier_on(dev_a); netif_carrier_on(dev_b); } out_err: put_net(ns_b); put_net(ns_a); rtnl_unlock(); return !err ? 
count : err; } static BUS_ATTR_WO(link_device); static ssize_t unlink_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim, *peer; struct net_device *dev; unsigned int ifidx; int netnsfd, err; struct net *ns; err = sscanf(buf, "%u:%u", &netnsfd, &ifidx); if (err != 2) { pr_err("Format for unlinking a device is \"netnsfd:ifidx\" (int uint).\n"); return -EINVAL; } ns = get_net_ns_by_fd(netnsfd); if (IS_ERR(ns)) { pr_err("Could not find netns with fd: %d\n", netnsfd); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev = __dev_get_by_index(ns, ifidx); if (!dev) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx, netnsfd); goto out_put_netns; } if (!netdev_is_nsim(dev)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx, netnsfd); goto out_put_netns; } nsim = netdev_priv(dev); peer = rtnl_dereference(nsim->peer); if (!peer) goto out_put_netns; netif_carrier_off(dev); netif_carrier_off(peer->netdev); err = 0; RCU_INIT_POINTER(nsim->peer, NULL); RCU_INIT_POINTER(peer->peer, NULL); synchronize_net(); netif_tx_wake_all_queues(dev); netif_tx_wake_all_queues(peer->netdev); out_put_netns: put_net(ns); rtnl_unlock(); return !err ? count : err; } static BUS_ATTR_WO(unlink_device); static struct attribute *nsim_bus_attrs[] = { &bus_attr_new_device.attr, &bus_attr_del_device.attr, &bus_attr_link_device.attr, &bus_attr_unlink_device.attr, NULL }; ATTRIBUTE_GROUPS(nsim_bus); static int nsim_bus_probe(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_drv_probe(nsim_bus_dev); } static void nsim_bus_remove(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); nsim_drv_remove(nsim_bus_dev); } static int nsim_num_vf(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_bus_dev->num_vfs; } static const struct bus_type nsim_bus = { .name = DRV_NAME, .dev_name = DRV_NAME, .bus_groups = nsim_bus_groups, .probe = nsim_bus_probe, .remove = nsim_bus_remove, .num_vf = nsim_num_vf, }; #define NSIM_BUS_DEV_MAX_VFS 4 static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues) { struct nsim_bus_dev *nsim_bus_dev; int err; nsim_bus_dev = kzalloc(sizeof(*nsim_bus_dev), GFP_KERNEL); if (!nsim_bus_dev) return ERR_PTR(-ENOMEM); err = ida_alloc_range(&nsim_bus_dev_ids, id, id, GFP_KERNEL); if (err < 0) goto err_nsim_bus_dev_free; nsim_bus_dev->dev.id = err; nsim_bus_dev->dev.bus = &nsim_bus; nsim_bus_dev->dev.type = &nsim_bus_dev_type; nsim_bus_dev->port_count = port_count; nsim_bus_dev->num_queues = num_queues; nsim_bus_dev->initial_net = current->nsproxy->net_ns; nsim_bus_dev->max_vfs = NSIM_BUS_DEV_MAX_VFS; /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); err = device_register(&nsim_bus_dev->dev); if (err) goto err_nsim_bus_dev_id_free; return nsim_bus_dev; err_nsim_bus_dev_id_free: ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); put_device(&nsim_bus_dev->dev); nsim_bus_dev = NULL; err_nsim_bus_dev_free: kfree(nsim_bus_dev); return ERR_PTR(err); } static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev) { /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); device_unregister(&nsim_bus_dev->dev); } static struct device_driver nsim_driver = { .name = DRV_NAME, .bus = &nsim_bus, .owner = THIS_MODULE, }; int nsim_bus_init(void) { int err; err = bus_register(&nsim_bus); 
if (err) return err; err = driver_register(&nsim_driver); if (err) goto err_bus_unregister; refcount_set(&nsim_bus_devs, 1); /* Allow using resources */ smp_store_release(&nsim_bus_enable, true); return 0; err_bus_unregister: bus_unregister(&nsim_bus); return err; } void nsim_bus_exit(void) { struct nsim_bus_dev *nsim_bus_dev, *tmp; /* Disallow using resources */ smp_store_release(&nsim_bus_enable, false); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); mutex_lock(&nsim_bus_dev_list_lock); list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); } mutex_unlock(&nsim_bus_dev_list_lock); wait_for_completion(&nsim_bus_devs_released); driver_unregister(&nsim_driver); bus_unregister(&nsim_bus); } |
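/*
 * Illustrative sketch only (not part of this file): how a module init/exit
 * path is expected to pair nsim_bus_init() with nsim_bus_exit(), and how the
 * resulting sysfs interface is then driven from user space, e.g.
 *
 *	echo "10 1 4" > /sys/bus/netdevsim/new_device   (id 10, 1 port, 4 queues)
 *	echo "10"     > /sys/bus/netdevsim/del_device
 *
 * nsim_example_init()/nsim_example_exit() are hypothetical names; the real
 * module entry points live in another file of the driver.
 */
static int __maybe_unused nsim_example_init(void)
{
	/* Registers the bus and driver and enables new_device/del_device. */
	return nsim_bus_init();
}

static void __maybe_unused nsim_example_exit(void)
{
	/* Unregisters any remaining devices and waits for their release. */
	nsim_bus_exit();
}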
| 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/string.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/caif/cfpkt.h> #define PKT_PREFIX 48 #define PKT_POSTFIX 2 #define PKT_LEN_WHEN_EXTENDING 128 #define PKT_ERROR(pkt, errmsg) \ do { \ cfpkt_priv(pkt)->erronous = true; \ skb_reset_tail_pointer(&pkt->skb); \ pr_warn(errmsg); \ } while (0) /* * net/caif/ is generic and does not * understand SKB, so we do this typecast */ struct cfpkt { struct sk_buff skb; }; /* Private data inside SKB */ struct cfpkt_priv_data { struct dev_info dev_info; bool erronous; }; static inline struct cfpkt_priv_data *cfpkt_priv(struct cfpkt *pkt) { return (struct cfpkt_priv_data *) pkt->skb.cb; } static inline bool is_erronous(struct cfpkt *pkt) { return cfpkt_priv(pkt)->erronous; } static inline struct sk_buff *pkt_to_skb(struct cfpkt *pkt) { return &pkt->skb; } static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb) { return (struct cfpkt *) skb; } struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt) { struct cfpkt *pkt = skb_to_pkt(nativepkt); cfpkt_priv(pkt)->erronous = false; return pkt; } EXPORT_SYMBOL(cfpkt_fromnative); void *cfpkt_tonative(struct cfpkt *pkt) { return (void *) pkt; } EXPORT_SYMBOL(cfpkt_tonative); static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx) { struct sk_buff *skb; skb = alloc_skb(len + pfx, GFP_ATOMIC); if (unlikely(skb == NULL)) return NULL; skb_reserve(skb, pfx); return skb_to_pkt(skb); } inline struct cfpkt *cfpkt_create(u16 len) { return cfpkt_create_pfx(len + PKT_POSTFIX, PKT_PREFIX); } void cfpkt_destroy(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); kfree_skb(skb); } inline bool cfpkt_more(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len > 0; } int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); if (skb_headlen(skb) >= len) { memcpy(data, skb->data, len); return 0; } return !cfpkt_extr_head(pkt, data, len) && !cfpkt_add_head(pkt, data, len); } 
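/*
 * Illustrative sketch only (not part of this file): peeking at a one-byte
 * framing header without consuming it, then stripping it once the frame is
 * accepted.  cfpkt_example_strip_hdr() is a hypothetical helper meant only
 * to show the cfpkt_peek_head()/cfpkt_extr_head() pairing; both functions
 * are declared in <net/caif/cfpkt.h>.
 */
static int __maybe_unused cfpkt_example_strip_hdr(struct cfpkt *pkt, u8 *hdr)
{
	/* Look at the first byte but leave the packet length unchanged. */
	if (cfpkt_peek_head(pkt, hdr, 1) < 0)
		return -EPROTO;

	/* Frame accepted: consume the header for the next layer. */
	return cfpkt_extr_head(pkt, NULL, 1);
}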
int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); u8 *from; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(len > skb->len)) { PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } if (unlikely(len > skb_headlen(skb))) { if (unlikely(skb_linearize(skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } } from = skb_pull(skb, len); from -= len; if (data) memcpy(data, from, len); return 0; } EXPORT_SYMBOL(cfpkt_extr_head); int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); u8 *data = dta; u8 *from; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } if (unlikely(skb->data + len > skb_tail_pointer(skb))) { PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } from = skb_tail_pointer(skb) - len; skb_trim(skb, skb->len - len); memcpy(data, from, len); return 0; } int cfpkt_pad_trail(struct cfpkt *pkt, u16 len) { return cfpkt_add_body(pkt, NULL, len); } int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); struct sk_buff *lastskb; u8 *to; u16 addlen = 0; if (unlikely(is_erronous(pkt))) return -EPROTO; lastskb = skb; /* Check whether we need to add space at the tail */ if (unlikely(skb_tailroom(skb) < len)) { if (likely(len < PKT_LEN_WHEN_EXTENDING)) addlen = PKT_LEN_WHEN_EXTENDING; else addlen = len; } /* Check whether we need to change the SKB before writing to the tail */ if (unlikely((addlen > 0) || skb_cloned(skb) || skb_shared(skb))) { /* Make sure data is writable */ if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) { PKT_ERROR(pkt, "cow failed\n"); return -EPROTO; } } /* All set to put the last SKB and optionally write data there. */ to = pskb_put(skb, lastskb, len); if (likely(data)) memcpy(to, data, len); return 0; } inline int cfpkt_addbdy(struct cfpkt *pkt, u8 data) { return cfpkt_add_body(pkt, &data, 1); } int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); struct sk_buff *lastskb; u8 *to; const u8 *data = data2; int ret; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_headroom(skb) < len)) { PKT_ERROR(pkt, "no headroom\n"); return -EPROTO; } /* Make sure data is writable */ ret = skb_cow_data(skb, 0, &lastskb); if (unlikely(ret < 0)) { PKT_ERROR(pkt, "cow failed\n"); return ret; } to = skb_push(skb, len); memcpy(to, data, len); return 0; } EXPORT_SYMBOL(cfpkt_add_head); inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len) { return cfpkt_add_body(pkt, data, len); } inline u16 cfpkt_getlen(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len; } int cfpkt_iterate(struct cfpkt *pkt, u16 (*iter_func)(u16, void *, u16), u16 data) { /* * Don't care about the performance hit of linearizing, * Checksum should not be used on high-speed interfaces anyway. 
*/ if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(&pkt->skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt)); } int cfpkt_setlen(struct cfpkt *pkt, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); if (unlikely(is_erronous(pkt))) return -EPROTO; if (likely(len <= skb->len)) { if (unlikely(skb->data_len)) ___pskb_trim(skb, len); else skb_trim(skb, len); return cfpkt_getlen(pkt); } /* Need to expand SKB */ if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len))) PKT_ERROR(pkt, "skb_pad_trail failed\n"); return cfpkt_getlen(pkt); } struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, struct cfpkt *addpkt, u16 expectlen) { struct sk_buff *dst = pkt_to_skb(dstpkt); struct sk_buff *add = pkt_to_skb(addpkt); u16 addlen = skb_headlen(add); u16 neededtailspace; struct sk_buff *tmp; u16 dstlen; u16 createlen; if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) { return dstpkt; } neededtailspace = max(expectlen, addlen); if (dst->tail + neededtailspace > dst->end) { /* Create a dumplicate of 'dst' with more tail space */ struct cfpkt *tmppkt; dstlen = skb_headlen(dst); createlen = dstlen + neededtailspace; tmppkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX); if (tmppkt == NULL) return NULL; tmp = pkt_to_skb(tmppkt); skb_put_data(tmp, dst->data, dstlen); cfpkt_destroy(dstpkt); dst = tmp; } skb_put_data(dst, add->data, skb_headlen(add)); cfpkt_destroy(addpkt); return skb_to_pkt(dst); } struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos) { struct sk_buff *skb2; struct sk_buff *skb = pkt_to_skb(pkt); struct cfpkt *tmppkt; u8 *split = skb->data + pos; u16 len2nd = skb_tail_pointer(skb) - split; if (unlikely(is_erronous(pkt))) return NULL; if (skb->data + pos > skb_tail_pointer(skb)) { PKT_ERROR(pkt, "trying to split beyond end of packet\n"); return NULL; } /* Create a new packet for the second part of the data */ tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX, PKT_PREFIX); if (tmppkt == NULL) return NULL; skb2 = pkt_to_skb(tmppkt); if (skb2 == NULL) return NULL; skb_put_data(skb2, split, len2nd); /* Reduce the length of the original packet */ skb_trim(skb, pos); skb2->priority = skb->priority; return skb_to_pkt(skb2); } bool cfpkt_erroneous(struct cfpkt *pkt) { return cfpkt_priv(pkt)->erronous; } struct caif_payload_info *cfpkt_info(struct cfpkt *pkt) { return (struct caif_payload_info *)&pkt_to_skb(pkt)->cb; } EXPORT_SYMBOL(cfpkt_info); void cfpkt_set_prio(struct cfpkt *pkt, int prio) { pkt_to_skb(pkt)->priority = prio; } EXPORT_SYMBOL(cfpkt_set_prio); |
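/*
 * Illustrative sketch only (not part of this file): building a small CAIF
 * frame with the helpers above.  cfpkt_example_build() and its parameters
 * are hypothetical and exist purely to show the intended
 * cfpkt_create()/cfpkt_addbdy()/cfpkt_add_body() call order; real users sit
 * in the CAIF protocol layers.
 */
static struct cfpkt * __maybe_unused
cfpkt_example_build(const u8 *payload, u16 len, u8 cmd)
{
	struct cfpkt *pkt;

	/* cfpkt_create() reserves PKT_PREFIX headroom for later headers. */
	pkt = cfpkt_create(len + 1);
	if (!pkt)
		return NULL;

	/* One command byte followed by the payload. */
	if (cfpkt_addbdy(pkt, cmd) < 0 ||
	    cfpkt_add_body(pkt, payload, len) < 0) {
		cfpkt_destroy(pkt);
		return NULL;
	}

	return pkt;
}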
| 1 2 2 2 2 2 1 1 1 2 2 2 2 1 2 2 2 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 | // SPDX-License-Identifier: GPL-2.0+ /* * Comedi driver for CIO-DAS16/M1 * Author: Frank Mori Hess, based on code from the das16 driver. * Copyright (C) 2001 Frank Mori Hess <fmhess@users.sourceforge.net> * * COMEDI - Linux Control and Measurement Device Interface * Copyright (C) 2000 David A. Schleef <ds@schleef.org> */ /* * Driver: das16m1 * Description: CIO-DAS16/M1 * Author: Frank Mori Hess <fmhess@users.sourceforge.net> * Devices: [Measurement Computing] CIO-DAS16/M1 (das16m1) * Status: works * * This driver supports a single board - the CIO-DAS16/M1. As far as I know, * there are no other boards that have the same register layout. Even the * CIO-DAS16/M1/16 is significantly different. * * I was _barely_ able to reach the full 1 MHz capability of this board, using * a hard real-time interrupt (set the TRIG_RT flag in your struct comedi_cmd * and use rtlinux or RTAI). The board can't do dma, so the bottleneck is * pulling the data across the ISA bus. I timed the interrupt handler, and it * took my computer ~470 microseconds to pull 512 samples from the board. 
So * at 1 Mhz sampling rate, expect your CPU to be spending almost all of its * time in the interrupt handler. * * This board has some unusual restrictions for its channel/gain list. If the * list has 2 or more channels in it, then two conditions must be satisfied: * (1) - even/odd channels must appear at even/odd indices in the list * (2) - the list must have an even number of entries. * * Configuration options: * [0] - base io address * [1] - irq (optional, but you probably want it) * * irq can be omitted, although the cmd interface will not work without it. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/comedi/comedidev.h> #include <linux/comedi/comedi_8255.h> #include <linux/comedi/comedi_8254.h> /* * Register map (dev->iobase) */ #define DAS16M1_AI_REG 0x00 /* 16-bit register */ #define DAS16M1_AI_TO_CHAN(x) (((x) >> 0) & 0xf) #define DAS16M1_AI_TO_SAMPLE(x) (((x) >> 4) & 0xfff) #define DAS16M1_CS_REG 0x02 #define DAS16M1_CS_EXT_TRIG BIT(0) #define DAS16M1_CS_OVRUN BIT(5) #define DAS16M1_CS_IRQDATA BIT(7) #define DAS16M1_DI_REG 0x03 #define DAS16M1_DO_REG 0x03 #define DAS16M1_CLR_INTR_REG 0x04 #define DAS16M1_INTR_CTRL_REG 0x05 #define DAS16M1_INTR_CTRL_PACER(x) (((x) & 0x3) << 0) #define DAS16M1_INTR_CTRL_PACER_EXT DAS16M1_INTR_CTRL_PACER(2) #define DAS16M1_INTR_CTRL_PACER_INT DAS16M1_INTR_CTRL_PACER(3) #define DAS16M1_INTR_CTRL_PACER_MASK DAS16M1_INTR_CTRL_PACER(3) #define DAS16M1_INTR_CTRL_IRQ(x) (((x) & 0x7) << 4) #define DAS16M1_INTR_CTRL_INTE BIT(7) #define DAS16M1_Q_ADDR_REG 0x06 #define DAS16M1_Q_REG 0x07 #define DAS16M1_Q_CHAN(x) (((x) & 0x7) << 0) #define DAS16M1_Q_RANGE(x) (((x) & 0xf) << 4) #define DAS16M1_8254_IOBASE1 0x08 #define DAS16M1_8254_IOBASE2 0x0c #define DAS16M1_8255_IOBASE 0x400 #define DAS16M1_8254_IOBASE3 0x404 #define DAS16M1_SIZE2 0x08 #define DAS16M1_AI_FIFO_SZ 1024 /* # samples */ static const struct comedi_lrange range_das16m1 = { 9, { BIP_RANGE(5), BIP_RANGE(2.5), BIP_RANGE(1.25), BIP_RANGE(0.625), UNI_RANGE(10), UNI_RANGE(5), UNI_RANGE(2.5), UNI_RANGE(1.25), BIP_RANGE(10) } }; struct das16m1_private { struct comedi_8254 *counter; unsigned int intr_ctrl; unsigned int adc_count; u16 initial_hw_count; unsigned short ai_buffer[DAS16M1_AI_FIFO_SZ]; unsigned long extra_iobase; }; static void das16m1_ai_set_queue(struct comedi_device *dev, unsigned int *chanspec, unsigned int len) { unsigned int i; for (i = 0; i < len; i++) { unsigned int chan = CR_CHAN(chanspec[i]); unsigned int range = CR_RANGE(chanspec[i]); outb(i, dev->iobase + DAS16M1_Q_ADDR_REG); outb(DAS16M1_Q_CHAN(chan) | DAS16M1_Q_RANGE(range), dev->iobase + DAS16M1_Q_REG); } } static void das16m1_ai_munge(struct comedi_device *dev, struct comedi_subdevice *s, void *data, unsigned int num_bytes, unsigned int start_chan_index) { unsigned short *array = data; unsigned int nsamples = comedi_bytes_to_samples(s, num_bytes); unsigned int i; /* * The fifo values have the channel number in the lower 4-bits and * the sample in the upper 12-bits. This just shifts the values * to remove the channel numbers. 
*/ for (i = 0; i < nsamples; i++) array[i] = DAS16M1_AI_TO_SAMPLE(array[i]); } static int das16m1_ai_check_chanlist(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_cmd *cmd) { int i; if (cmd->chanlist_len == 1) return 0; if ((cmd->chanlist_len % 2) != 0) { dev_dbg(dev->class_dev, "chanlist must be of even length or length 1\n"); return -EINVAL; } for (i = 0; i < cmd->chanlist_len; i++) { unsigned int chan = CR_CHAN(cmd->chanlist[i]); if ((i % 2) != (chan % 2)) { dev_dbg(dev->class_dev, "even/odd channels must go have even/odd chanlist indices\n"); return -EINVAL; } } return 0; } static int das16m1_ai_cmdtest(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_cmd *cmd) { int err = 0; /* Step 1 : check if triggers are trivially valid */ err |= comedi_check_trigger_src(&cmd->start_src, TRIG_NOW | TRIG_EXT); err |= comedi_check_trigger_src(&cmd->scan_begin_src, TRIG_FOLLOW); err |= comedi_check_trigger_src(&cmd->convert_src, TRIG_TIMER | TRIG_EXT); err |= comedi_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT); err |= comedi_check_trigger_src(&cmd->stop_src, TRIG_COUNT | TRIG_NONE); if (err) return 1; /* Step 2a : make sure trigger sources are unique */ err |= comedi_check_trigger_is_unique(cmd->start_src); err |= comedi_check_trigger_is_unique(cmd->convert_src); err |= comedi_check_trigger_is_unique(cmd->stop_src); /* Step 2b : and mutually compatible */ if (err) return 2; /* Step 3: check if arguments are trivially valid */ err |= comedi_check_trigger_arg_is(&cmd->start_arg, 0); if (cmd->scan_begin_src == TRIG_FOLLOW) /* internal trigger */ err |= comedi_check_trigger_arg_is(&cmd->scan_begin_arg, 0); if (cmd->convert_src == TRIG_TIMER) err |= comedi_check_trigger_arg_min(&cmd->convert_arg, 1000); err |= comedi_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len); if (cmd->stop_src == TRIG_COUNT) err |= comedi_check_trigger_arg_min(&cmd->stop_arg, 1); else /* TRIG_NONE */ err |= comedi_check_trigger_arg_is(&cmd->stop_arg, 0); if (err) return 3; /* step 4: fix up arguments */ if (cmd->convert_src == TRIG_TIMER) { unsigned int arg = cmd->convert_arg; comedi_8254_cascade_ns_to_timer(dev->pacer, &arg, cmd->flags); err |= comedi_check_trigger_arg_is(&cmd->convert_arg, arg); } if (err) return 4; /* Step 5: check channel list if it exists */ if (cmd->chanlist && cmd->chanlist_len > 0) err |= das16m1_ai_check_chanlist(dev, s, cmd); if (err) return 5; return 0; } static int das16m1_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s) { struct das16m1_private *devpriv = dev->private; struct comedi_async *async = s->async; struct comedi_cmd *cmd = &async->cmd; unsigned int byte; /* set software count */ devpriv->adc_count = 0; /* * Initialize lower half of hardware counter, used to determine how * many samples are in fifo. Value doesn't actually load into counter * until counter's next clock (the next a/d conversion). */ comedi_8254_set_mode(devpriv->counter, 1, I8254_MODE2 | I8254_BINARY); comedi_8254_write(devpriv->counter, 1, 0); /* * Remember current reading of counter so we know when counter has * actually been loaded. 
*/ devpriv->initial_hw_count = comedi_8254_read(devpriv->counter, 1); das16m1_ai_set_queue(dev, cmd->chanlist, cmd->chanlist_len); /* enable interrupts and set internal pacer counter mode and counts */ devpriv->intr_ctrl &= ~DAS16M1_INTR_CTRL_PACER_MASK; if (cmd->convert_src == TRIG_TIMER) { comedi_8254_update_divisors(dev->pacer); comedi_8254_pacer_enable(dev->pacer, 1, 2, true); devpriv->intr_ctrl |= DAS16M1_INTR_CTRL_PACER_INT; } else { /* TRIG_EXT */ devpriv->intr_ctrl |= DAS16M1_INTR_CTRL_PACER_EXT; } /* set control & status register */ byte = 0; /* * If we are using external start trigger (also board dislikes having * both start and conversion triggers external simultaneously). */ if (cmd->start_src == TRIG_EXT && cmd->convert_src != TRIG_EXT) byte |= DAS16M1_CS_EXT_TRIG; outb(byte, dev->iobase + DAS16M1_CS_REG); /* clear interrupt */ outb(0, dev->iobase + DAS16M1_CLR_INTR_REG); devpriv->intr_ctrl |= DAS16M1_INTR_CTRL_INTE; outb(devpriv->intr_ctrl, dev->iobase + DAS16M1_INTR_CTRL_REG); return 0; } static int das16m1_ai_cancel(struct comedi_device *dev, struct comedi_subdevice *s) { struct das16m1_private *devpriv = dev->private; /* disable interrupts and pacer */ devpriv->intr_ctrl &= ~(DAS16M1_INTR_CTRL_INTE | DAS16M1_INTR_CTRL_PACER_MASK); outb(devpriv->intr_ctrl, dev->iobase + DAS16M1_INTR_CTRL_REG); return 0; } static int das16m1_ai_eoc(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned long context) { unsigned int status; status = inb(dev->iobase + DAS16M1_CS_REG); if (status & DAS16M1_CS_IRQDATA) return 0; return -EBUSY; } static int das16m1_ai_insn_read(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { int ret; int i; das16m1_ai_set_queue(dev, &insn->chanspec, 1); for (i = 0; i < insn->n; i++) { unsigned short val; /* clear interrupt */ outb(0, dev->iobase + DAS16M1_CLR_INTR_REG); /* trigger conversion */ outb(0, dev->iobase + DAS16M1_AI_REG); ret = comedi_timeout(dev, s, insn, das16m1_ai_eoc, 0); if (ret) return ret; val = inw(dev->iobase + DAS16M1_AI_REG); data[i] = DAS16M1_AI_TO_SAMPLE(val); } return insn->n; } static int das16m1_di_insn_bits(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { data[1] = inb(dev->iobase + DAS16M1_DI_REG) & 0xf; return insn->n; } static int das16m1_do_insn_bits(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { if (comedi_dio_update_state(s, data)) outb(s->state, dev->iobase + DAS16M1_DO_REG); data[1] = s->state; return insn->n; } static void das16m1_handler(struct comedi_device *dev, unsigned int status) { struct das16m1_private *devpriv = dev->private; struct comedi_subdevice *s = dev->read_subdev; struct comedi_async *async = s->async; struct comedi_cmd *cmd = &async->cmd; u16 num_samples; u16 hw_counter; /* figure out how many samples are in fifo */ hw_counter = comedi_8254_read(devpriv->counter, 1); /* * Make sure hardware counter reading is not bogus due to initial * value not having been loaded yet. */ if (devpriv->adc_count == 0 && hw_counter == devpriv->initial_hw_count) { num_samples = 0; } else { /* * The calculation of num_samples looks odd, but it uses the * following facts. 16 bit hardware counter is initialized with * value of zero (which really means 0x1000). The counter * decrements by one on each conversion (when the counter * decrements from zero it goes to 0xffff). 
num_samples is a * 16 bit variable, so it will roll over in a similar fashion * to the hardware counter. Work it out, and this is what you * get. */ num_samples = -hw_counter - devpriv->adc_count; } /* check if we only need some of the points */ if (cmd->stop_src == TRIG_COUNT) { if (num_samples > cmd->stop_arg * cmd->chanlist_len) num_samples = cmd->stop_arg * cmd->chanlist_len; } /* make sure we don't try to get too many points if fifo has overrun */ if (num_samples > DAS16M1_AI_FIFO_SZ) num_samples = DAS16M1_AI_FIFO_SZ; insw(dev->iobase, devpriv->ai_buffer, num_samples); comedi_buf_write_samples(s, devpriv->ai_buffer, num_samples); devpriv->adc_count += num_samples; if (cmd->stop_src == TRIG_COUNT) { if (devpriv->adc_count >= cmd->stop_arg * cmd->chanlist_len) { /* end of acquisition */ async->events |= COMEDI_CB_EOA; } } /* * This probably won't catch overruns since the card doesn't generate * overrun interrupts, but we might as well try. */ if (status & DAS16M1_CS_OVRUN) { async->events |= COMEDI_CB_ERROR; dev_err(dev->class_dev, "fifo overflow\n"); } comedi_handle_events(dev, s); } static int das16m1_ai_poll(struct comedi_device *dev, struct comedi_subdevice *s) { unsigned long flags; unsigned int status; /* prevent race with interrupt handler */ spin_lock_irqsave(&dev->spinlock, flags); status = inb(dev->iobase + DAS16M1_CS_REG); das16m1_handler(dev, status); spin_unlock_irqrestore(&dev->spinlock, flags); return comedi_buf_n_bytes_ready(s); } static irqreturn_t das16m1_interrupt(int irq, void *d) { int status; struct comedi_device *dev = d; if (!dev->attached) { dev_err(dev->class_dev, "premature interrupt\n"); return IRQ_HANDLED; } /* prevent race with comedi_poll() */ spin_lock(&dev->spinlock); status = inb(dev->iobase + DAS16M1_CS_REG); if ((status & (DAS16M1_CS_IRQDATA | DAS16M1_CS_OVRUN)) == 0) { dev_err(dev->class_dev, "spurious interrupt\n"); spin_unlock(&dev->spinlock); return IRQ_NONE; } das16m1_handler(dev, status); /* clear interrupt */ outb(0, dev->iobase + DAS16M1_CLR_INTR_REG); spin_unlock(&dev->spinlock); return IRQ_HANDLED; } static int das16m1_irq_bits(unsigned int irq) { switch (irq) { case 10: return 0x0; case 11: return 0x1; case 12: return 0x2; case 15: return 0x3; case 2: return 0x4; case 3: return 0x5; case 5: return 0x6; case 7: return 0x7; default: return 0x0; } } static int das16m1_attach(struct comedi_device *dev, struct comedi_devconfig *it) { struct das16m1_private *devpriv; struct comedi_subdevice *s; int ret; devpriv = comedi_alloc_devpriv(dev, sizeof(*devpriv)); if (!devpriv) return -ENOMEM; ret = comedi_request_region(dev, it->options[0], 0x10); if (ret) return ret; /* Request an additional region for the 8255 and 3rd 8254 */ ret = __comedi_request_region(dev, dev->iobase + DAS16M1_8255_IOBASE, DAS16M1_SIZE2); if (ret) return ret; devpriv->extra_iobase = dev->iobase + DAS16M1_8255_IOBASE; /* only irqs 2, 3, 4, 5, 6, 7, 10, 11, 12, 14, and 15 are valid */ if (it->options[1] >= 2 && it->options[1] <= 15 && (1 << it->options[1]) & 0xdcfc) { ret = request_irq(it->options[1], das16m1_interrupt, 0, dev->board_name, dev); if (ret == 0) dev->irq = it->options[1]; } dev->pacer = comedi_8254_io_alloc(dev->iobase + DAS16M1_8254_IOBASE2, I8254_OSC_BASE_10MHZ, I8254_IO8, 0); if (IS_ERR(dev->pacer)) return PTR_ERR(dev->pacer); devpriv->counter = comedi_8254_io_alloc(dev->iobase + DAS16M1_8254_IOBASE1, 0, I8254_IO8, 0); if (IS_ERR(devpriv->counter)) return PTR_ERR(devpriv->counter); ret = comedi_alloc_subdevices(dev, 4); if (ret) return ret; /* Analog Input 
subdevice */ s = &dev->subdevices[0]; s->type = COMEDI_SUBD_AI; s->subdev_flags = SDF_READABLE | SDF_DIFF; s->n_chan = 8; s->maxdata = 0x0fff; s->range_table = &range_das16m1; s->insn_read = das16m1_ai_insn_read; if (dev->irq) { dev->read_subdev = s; s->subdev_flags |= SDF_CMD_READ; s->len_chanlist = 256; s->do_cmdtest = das16m1_ai_cmdtest; s->do_cmd = das16m1_ai_cmd; s->cancel = das16m1_ai_cancel; s->poll = das16m1_ai_poll; s->munge = das16m1_ai_munge; } /* Digital Input subdevice */ s = &dev->subdevices[1]; s->type = COMEDI_SUBD_DI; s->subdev_flags = SDF_READABLE; s->n_chan = 4; s->maxdata = 1; s->range_table = &range_digital; s->insn_bits = das16m1_di_insn_bits; /* Digital Output subdevice */ s = &dev->subdevices[2]; s->type = COMEDI_SUBD_DO; s->subdev_flags = SDF_WRITABLE; s->n_chan = 4; s->maxdata = 1; s->range_table = &range_digital; s->insn_bits = das16m1_do_insn_bits; /* Digital I/O subdevice (8255) */ s = &dev->subdevices[3]; ret = subdev_8255_io_init(dev, s, DAS16M1_8255_IOBASE); if (ret) return ret; /* initialize digital output lines */ outb(0, dev->iobase + DAS16M1_DO_REG); /* set the interrupt level */ devpriv->intr_ctrl = DAS16M1_INTR_CTRL_IRQ(das16m1_irq_bits(dev->irq)); outb(devpriv->intr_ctrl, dev->iobase + DAS16M1_INTR_CTRL_REG); return 0; } static void das16m1_detach(struct comedi_device *dev) { struct das16m1_private *devpriv = dev->private; if (devpriv) { if (devpriv->extra_iobase) release_region(devpriv->extra_iobase, DAS16M1_SIZE2); if (!IS_ERR(devpriv->counter)) kfree(devpriv->counter); } comedi_legacy_detach(dev); } static struct comedi_driver das16m1_driver = { .driver_name = "das16m1", .module = THIS_MODULE, .attach = das16m1_attach, .detach = das16m1_detach, }; module_comedi_driver(das16m1_driver); MODULE_AUTHOR("Comedi https://www.comedi.org"); MODULE_DESCRIPTION("Comedi driver for CIO-DAS16/M1 ISA cards"); MODULE_LICENSE("GPL"); |
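/*
 * Illustrative sketch only (not part of the driver): reading one raw sample
 * from AI channel 0 of the CIO-DAS16/M1 from user space with comedilib,
 * after the legacy board has been attached with something like
 * "comedi_config /dev/comedi0 das16m1 0x300,5" (the base address and IRQ
 * here are examples).  This is user-space code and assumes comedilib is
 * installed; the device path is an example as well.
 */
#include <stdio.h>
#include <comedilib.h>

int main(void)
{
	comedi_t *dev;
	lsampl_t sample;

	dev = comedi_open("/dev/comedi0");
	if (!dev)
		return 1;

	/* subdevice 0 = analog input, channel 0, range index 0, grounded */
	if (comedi_data_read(dev, 0, 0, 0, AREF_GROUND, &sample) < 0) {
		comedi_close(dev);
		return 1;
	}

	printf("raw sample: %u\n", sample);
	comedi_close(dev);
	return 0;
}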